/pandas/core/frame.py
Python | 8037 lines | 8013 code | 10 blank | 14 comment | 24 complexity | f7a69c2d03a0237ebc71af5ae471cb7a MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- # pylint: disable=E1101
- # pylint: disable=W0212,W0703,W0622
- """
- DataFrame
- ---------
- An efficient 2D container for potentially mixed-type time series or other
- labeled data series.
- Similar to its R counterpart, data.frame, except providing automatic data
- alignment and a host of useful data manipulation methods having to do with the
- labeling information
- """
- from __future__ import division
- import collections
- from collections import OrderedDict
- import functools
- import itertools
- import sys
- import warnings
- from textwrap import dedent
- import numpy as np
- import numpy.ma as ma
- from pandas._libs import lib, algos as libalgos
- from pandas.util._decorators import (Appender, Substitution,
- rewrite_axis_style_signature,
- deprecate_kwarg)
- from pandas.util._validators import (validate_bool_kwarg,
- validate_axis_style_args)
- from pandas import compat
- from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u,
- PY36, raise_with_traceback, Iterator,
- string_and_binary_types)
- from pandas.compat.numpy import function as nv
- from pandas.core.dtypes.cast import (
- maybe_upcast,
- cast_scalar_to_array,
- infer_dtype_from_scalar,
- maybe_cast_to_datetime,
- maybe_infer_to_datetimelike,
- maybe_convert_platform,
- maybe_downcast_to_dtype,
- invalidate_string_dtypes,
- coerce_to_dtypes,
- maybe_upcast_putmask,
- find_common_type)
- from pandas.core.dtypes.common import (
- is_dict_like,
- is_datetime64tz_dtype,
- is_object_dtype,
- is_extension_type,
- is_extension_array_dtype,
- is_datetime64_any_dtype,
- is_bool_dtype,
- is_integer_dtype,
- is_float_dtype,
- is_integer,
- is_scalar,
- is_dtype_equal,
- needs_i8_conversion,
- infer_dtype_from_object,
- ensure_float64,
- ensure_int64,
- ensure_platform_int,
- is_list_like,
- is_nested_list_like,
- is_iterator,
- is_sequence,
- is_named_tuple)
- from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
- from pandas.core.dtypes.missing import isna, notna
- from pandas.core import algorithms
- from pandas.core import common as com
- from pandas.core import nanops
- from pandas.core import ops
- from pandas.core.accessor import CachedAccessor
- from pandas.core.arrays import Categorical, ExtensionArray
- from pandas.core.arrays.datetimelike import (
- DatetimeLikeArrayMixin as DatetimeLikeArray
- )
- from pandas.core.config import get_option
- from pandas.core.generic import NDFrame, _shared_docs
- from pandas.core.index import (Index, MultiIndex, ensure_index,
- ensure_index_from_sequences)
- from pandas.core.indexes import base as ibase
- from pandas.core.indexes.datetimes import DatetimeIndex
- from pandas.core.indexes.period import PeriodIndex
- from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
- check_bool_indexer)
- from pandas.core.internals import BlockManager
- from pandas.core.internals.construction import (
- masked_rec_array_to_mgr, get_names_from_index, to_arrays,
- reorder_arrays, init_ndarray, init_dict,
- arrays_to_mgr, sanitize_index)
- from pandas.core.series import Series
- from pandas.io.formats import console
- from pandas.io.formats import format as fmt
- from pandas.io.formats.printing import pprint_thing
- import pandas.plotting._core as gfx
# ---------------------------------------------------------------------
# Docstring templates

# Keyword substitutions shared by many DataFrame method docstrings;
# interpolated via the ``Substitution``/``Appender`` decorators.
_shared_doc_kwargs = dict(
    axes='index, columns', klass='DataFrame',
    axes_single_arg="{0 or 'index', 1 or 'columns'}",
    axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    optional_by="""
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels

        .. versionchanged:: 0.23.0
           Allow specifying index or column level names.""",
    versionadded_to_excel='',
    optional_labels="""labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    optional_axis="""axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
)

# %-style fragment describing the ``numeric_only`` parameter; spliced into
# reduction-method docstrings.
_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data
"""

# Template for DataFrame.merge / pandas.merge docstrings; the ``%s`` slot
# receives extra leading parameter lines for the module-level function.
_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : tuple of (str, str), default ('_x', '_y')
    Suffix to apply to overlapping column names in the left and right
    side, respectively. To raise an exception on overlapping columns use
    (False, False).
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to output DataFrame called "_merge" with
    information on the source of each row.
    If string, column with information on source of each row will be added to
    output DataFrame, and column will be named value of string.
    Information column is Categorical-type and takes on a value of "left_only"
    for observations whose merge key only appears in 'left' DataFrame,
    "right_only" for observations whose merge key only appears in 'right'
    DataFrame, and "both" if the observation's merge key is found in both.
validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

    .. versionadded:: 0.21.0

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0
Support for merging named Series objects was added in version 0.24.0

Examples
--------

>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
    lkey value
0   foo      1
1   bar      2
2   baz      3
3   foo      5
>>> df2
    rkey value
0   foo      5
1   bar      6
2   baz      7
3   foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')
"""
- # -----------------------------------------------------------------------
- # DataFrame class
- class DataFrame(NDFrame):
- """
- Two-dimensional size-mutable, potentially heterogeneous tabular data
- structure with labeled axes (rows and columns). Arithmetic operations
- align on both row and column labels. Can be thought of as a dict-like
- container for Series objects. The primary pandas data structure.
- Parameters
- ----------
- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
- Dict can contain Series, arrays, constants, or list-like objects
- .. versionchanged :: 0.23.0
- If data is a dict, argument order is maintained for Python 3.6
- and later.
- index : Index or array-like
- Index to use for resulting frame. Will default to RangeIndex if
- no indexing information part of input data and no index provided
- columns : Index or array-like
- Column labels to use for resulting frame. Will default to
- RangeIndex (0, 1, 2, ..., n) if no column labels are provided
- dtype : dtype, default None
- Data type to force. Only a single dtype is allowed. If None, infer
- copy : boolean, default False
- Copy data from inputs. Only affects DataFrame / 2d ndarray input
- See Also
- --------
- DataFrame.from_records : Constructor from tuples, also record arrays.
- DataFrame.from_dict : From dicts of Series, arrays, or dicts.
- DataFrame.from_items : From sequence of (key, value) pairs
- read_csv, pandas.read_table, pandas.read_clipboard.
- Examples
- --------
- Constructing DataFrame from a dictionary.
- >>> d = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df = pd.DataFrame(data=d)
- >>> df
- col1 col2
- 0 1 3
- 1 2 4
- Notice that the inferred dtype is int64.
- >>> df.dtypes
- col1 int64
- col2 int64
- dtype: object
- To enforce a single dtype:
- >>> df = pd.DataFrame(data=d, dtype=np.int8)
- >>> df.dtypes
- col1 int8
- col2 int8
- dtype: object
- Constructing DataFrame from numpy ndarray:
- >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
- ... columns=['a', 'b', 'c'])
- >>> df2
- a b c
- 0 1 2 3
- 1 4 5 6
- 2 7 8 9
- """
    @property
    def _constructor(self):
        """Class used to build results of operations on this frame."""
        return DataFrame

    # 1-D slices (single rows/columns) are returned as Series.
    _constructor_sliced = Series

    # Attributes whose access is deprecated; extends the NDFrame set.
    _deprecations = NDFrame._deprecations | frozenset(
        ['get_value', 'set_value', 'from_csv', 'from_items'])

    # Names of registered custom accessors — presumably populated via the
    # accessor-registration machinery elsewhere; TODO confirm.
    _accessors = set()

    @property
    def _constructor_expanddim(self):
        """Higher-dimensional counterpart used when expanding dimensions."""
        # Imported lazily to avoid a circular import at module load time.
        from pandas.core.panel import Panel
        return Panel
- # ----------------------------------------------------------------------
- # Constructors
    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 copy=False):
        # Dispatch on the type of ``data`` to build a BlockManager (``mgr``),
        # then hand it to NDFrame.__init__. Branch order matters: DataFrame
        # unwraps to its BlockManager first, masked arrays are handled before
        # plain ndarrays, and the generic Iterable branch comes last.
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            # Share the other frame's internal manager (copy handled below).
            data = data._data

        if isinstance(data, BlockManager):
            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif isinstance(data, dict):
            mgr = init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords
            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = masked_rec_array_to_mgr(data, index, columns, dtype,
                                              copy)
            # a masked array
            else:
                mask = ma.getmaskarray(data)
                if mask.any():
                    # Upcast so masked slots can hold the fill value (NaN).
                    data, fill_value = maybe_upcast(data, copy=True)
                    data.soften_mask()  # set hardmask False if it was True
                    data[mask] = fill_value
                else:
                    data = data.copy()
                mgr = init_ndarray(data, index, columns, dtype=dtype,
                                   copy=copy)

        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                # Structured ndarray: treat each named field as a column.
                data_columns = list(data.dtype.names)
                data = {k: data[k] for k in data_columns}
                if columns is None:
                    columns = data_columns
                mgr = init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, 'name', None) is not None:
                # Named Series/Index: becomes a single column keyed by name.
                mgr = init_dict({data.name: data}, index, columns,
                                dtype=dtype)
            else:
                mgr = init_ndarray(data, index, columns, dtype=dtype,
                                   copy=copy)

        # For data is list-like, or Iterable (will consume into list)
        elif (isinstance(data, compat.Iterable)
              and not isinstance(data, string_and_binary_types)):
            if not isinstance(data, compat.Sequence):
                data = list(data)
            if len(data) > 0:
                if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                    # List of rows (records); namedtuple fields supply column
                    # labels when none were given.
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
                    arrays, columns = to_arrays(data, columns, dtype=dtype)
                    columns = ensure_index(columns)

                    # set the index
                    if index is None:
                        if isinstance(data[0], Series):
                            index = get_names_from_index(data)
                        elif isinstance(data[0], Categorical):
                            index = ibase.default_index(len(data[0]))
                        else:
                            index = ibase.default_index(len(data))

                    mgr = arrays_to_mgr(arrays, columns, index, columns,
                                        dtype=dtype)
                else:
                    mgr = init_ndarray(data, index, columns, dtype=dtype,
                                       copy=copy)
            else:
                mgr = init_dict({}, index, columns, dtype=dtype)
        else:
            # Last resort: let numpy try to interpret the input; a 0-d result
            # with both axes given means "broadcast this scalar".
            try:
                arr = np.array(data, dtype=dtype, copy=copy)
            except (ValueError, TypeError) as e:
                exc = TypeError('DataFrame constructor called with '
                                'incompatible data and dtype: {e}'.format(e=e))
                raise_with_traceback(exc)

            if arr.ndim == 0 and index is not None and columns is not None:
                values = cast_scalar_to_array((len(index), len(columns)),
                                              data, dtype=dtype)
                mgr = init_ndarray(values, index, columns,
                                   dtype=values.dtype, copy=False)
            else:
                raise ValueError('DataFrame constructor not properly called!')

        NDFrame.__init__(self, mgr, fastpath=True)
- # ----------------------------------------------------------------------
- @property
- def axes(self):
- """
- Return a list representing the axes of the DataFrame.
- It has the row axis labels and column axis labels as the only members.
- They are returned in that order.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.axes
- [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
- dtype='object')]
- """
- return [self.index, self.columns]
- @property
- def shape(self):
- """
- Return a tuple representing the dimensionality of the DataFrame.
- See Also
- --------
- ndarray.shape
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.shape
- (2, 2)
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
- ... 'col3': [5, 6]})
- >>> df.shape
- (2, 3)
- """
- return len(self.index), len(self.columns)
- @property
- def _is_homogeneous_type(self):
- """
- Whether all the columns in a DataFrame have the same type.
- Returns
- -------
- bool
- Examples
- --------
- >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
- True
- >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
- False
- Items with the same type but different sizes are considered
- different types.
- >>> DataFrame({
- ... "A": np.array([1, 2], dtype=np.int32),
- ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
- False
- """
- if self._data.any_extension_types:
- return len({block.dtype for block in self._data.blocks}) == 1
- else:
- return not self._data.is_mixed_type
- # ----------------------------------------------------------------------
- # Rendering Methods
- def _repr_fits_vertical_(self):
- """
- Check length against max_rows.
- """
- max_rows = get_option("display.max_rows")
- return len(self) <= max_rows
    def _repr_fits_horizontal_(self, ignore_width=False):
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of a non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if ((max_columns and nb_columns > max_columns) or
                ((not ignore_width) and width and nb_columns > (width // 2))):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or not console.in_interactive_session():
            return True

        if (get_option('display.width') is not None or
                console.in_ipython_frontend()):
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if not (max_rows is None):
            # rows are bounded: render only a truncated preview
            # (min of two, where one may be None)
            d = d.iloc[:min(max_rows, len(d))]
        else:
            # unlimited rows: rendering could be arbitrarily expensive,
            # so optimistically report that the repr fits
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(l) for l in value.split('\n'))

        return repr_width < width
- def _info_repr(self):
- """
- True if the repr should show the info view.
- """
- info_repr_option = (get_option("display.large_repr") == "info")
- return info_repr_option and not (self._repr_fits_horizontal_() and
- self._repr_fits_vertical_())
- def __unicode__(self):
- """
- Return a string representation for a particular DataFrame.
- Invoked by unicode(df) in py2 only. Yields a Unicode String in both
- py2/py3.
- """
- buf = StringIO(u(""))
- if self._info_repr():
- self.info(buf=buf)
- return buf.getvalue()
- max_rows = get_option("display.max_rows")
- max_cols = get_option("display.max_columns")
- show_dimensions = get_option("display.show_dimensions")
- if get_option("display.expand_frame_repr"):
- width, _ = console.get_console_size()
- else:
- width = None
- self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
- line_width=width, show_dimensions=show_dimensions)
- return buf.getvalue()
- def _repr_html_(self):
- """
- Return a html representation for a particular DataFrame.
- Mainly for IPython notebook.
- """
- if self._info_repr():
- buf = StringIO(u(""))
- self.info(buf=buf)
- # need to escape the <class>, should be the first line.
- val = buf.getvalue().replace('<', r'<', 1)
- val = val.replace('>', r'>', 1)
- return '<pre>' + val + '</pre>'
- if get_option("display.notebook_repr_html"):
- max_rows = get_option("display.max_rows")
- max_cols = get_option("display.max_columns")
- show_dimensions = get_option("display.show_dimensions")
- return self.to_html(max_rows=max_rows, max_cols=max_cols,
- show_dimensions=show_dimensions, notebook=True)
- else:
- return None
    @Substitution(header='Write out the column names. If a list of strings '
                         'is given, it is assumed to be aliases for the '
                         'column names')
    @Substitution(shared_params=fmt.common_docstring,
                  returns=fmt.return_docstring)
    def to_string(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
                  sparsify=None, index_names=True, justify=None,
                  max_rows=None, max_cols=None, show_dimensions=False,
                  decimal='.', line_width=None):
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        # All rendering work is delegated to the formatter.
        formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
                                           col_space=col_space, na_rep=na_rep,
                                           formatters=formatters,
                                           float_format=float_format,
                                           sparsify=sparsify, justify=justify,
                                           index_names=index_names,
                                           header=header, index=index,
                                           max_rows=max_rows,
                                           max_cols=max_cols,
                                           show_dimensions=show_dimensions,
                                           decimal=decimal,
                                           line_width=line_width)
        formatter.to_string()

        # Only return the rendered text when no output buffer was supplied;
        # otherwise the caller reads it from ``buf``.
        if buf is None:
            result = formatter.buf.getvalue()
            return result
    # ----------------------------------------------------------------------

    @property
    def style(self):
        """
        Property returning a Styler object containing methods for
        building a styled HTML representation of the DataFrame.

        See Also
        --------
        io.formats.style.Styler
        """
        # Imported lazily: the Styler machinery is only needed on demand.
        from pandas.io.formats.style import Styler
        return Styler(self)
- def iteritems(self):
- r"""
- Iterator over (column name, Series) pairs.
- Iterates over the DataFrame columns, returning a tuple with
- the column name and the content as a Series.
- Yields
- ------
- label : object
- The column names for the DataFrame being iterated over.
- content : Series
- The column entries belonging to each label, as a Series.
- See Also
- --------
- DataFrame.iterrows : Iterate over DataFrame rows as
- (index, Series) pairs.
- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
- of the values.
- Examples
- --------
- >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
- ... 'population': [1864, 22000, 80000]},
- ... index=['panda', 'polar', 'koala'])
- >>> df
- species population
- panda bear 1864
- polar bear 22000
- koala marsupial 80000
- >>> for label, content in df.iteritems():
- ... print('label:', label)
- ... print('content:', content, sep='\n')
- ...
- label: species
- content:
- panda bear
- polar bear
- koala marsupial
- Name: species, dtype: object
- label: population
- content:
- panda 1864
- polar 22000
- koala 80000
- Name: population, dtype: int64
- """
- if self.columns.is_unique and hasattr(self, '_item_cache'):
- for k in self.columns:
- yield k, self._get_item_cache(k)
- else:
- for i, k in enumerate(self.columns):
- yield k, self._ixs(i, axis=1)
- def iterrows(self):
- """
- Iterate over DataFrame rows as (index, Series) pairs.
- Yields
- ------
- index : label or tuple of label
- The index of the row. A tuple for a `MultiIndex`.
- data : Series
- The data of the row as a Series.
- it : generator
- A generator that iterates over the rows of the frame.
- See Also
- --------
- itertuples : Iterate over DataFrame rows as namedtuples of the values.
- iteritems : Iterate over (column name, Series) pairs.
- Notes
- -----
- 1. Because ``iterrows`` returns a Series for each row,
- it does **not** preserve dtypes across the rows (dtypes are
- preserved across columns for DataFrames). For example,
- >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
- >>> row = next(df.iterrows())[1]
- >>> row
- int 1.0
- float 1.5
- Name: 0, dtype: float64
- >>> print(row['int'].dtype)
- float64
- >>> print(df['int'].dtype)
- int64
- To preserve dtypes while iterating over the rows, it is better
- to use :meth:`itertuples` which returns namedtuples of the values
- and which is generally faster than ``iterrows``.
- 2. You should **never modify** something you are iterating over.
- This is not guaranteed to work in all cases. Depending on the
- data types, the iterator returns a copy and not a view, and writing
- to it will have no effect.
- """
- columns = self.columns
- klass = self._constructor_sliced
- for k, v in zip(self.index, self.values):
- s = klass(v, index=columns, name=k)
- yield k, s
    def itertuples(self, index=True, name="Pandas"):
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Yields
        -------
        collections.namedtuple
            Yields a namedtuple for each row in the DataFrame with the first
            field possibly being the index and following fields being the
            column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.iteritems : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore.
        With a large number of columns (>255), regular tuples are returned.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
        """
        arrays = []
        fields = list(self.columns)
        if index:
            arrays.append(self.index)
            fields.insert(0, "Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        # Python 3 supports at most 255 arguments to constructor, and
        # things get slow with this many fields in Python 2
        if name is not None and len(self.columns) + index < 256:
            # `rename` is unsupported in Python 2.6
            try:
                itertuple = collections.namedtuple(name, fields, rename=True)
                return map(itertuple._make, zip(*arrays))

            except Exception:
                # Deliberate best-effort: any failure building the namedtuple
                # class (e.g. an invalid ``name``) degrades to plain tuples
                # below rather than raising.
                pass

        # fallback to regular tuples
        return zip(*arrays)

    # Alias kept for dict-like API parity.
    items = iteritems
    def __len__(self):
        """
        Returns length of info axis, but here we use the index.
        """
        # Number of rows, i.e. the length of the row-label Index.
        return len(self.index)
    def dot(self, other):
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other`` in Python >= 3.5.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other in a DataFrame or a np.array.

        See Also
        --------
        Series.dot: Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
            0   1
        0   1   4
        1   2   2

        Note that the dot method gives the same result as @

        >>> df @ other
            0   1
        0   1   4
        1   2   2

        The dot method works also if other is an np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
            0   1
        0   1   4
        1   2   2
        """
        if isinstance(other, (Series, DataFrame)):
            # Align self's columns with other's index before multiplying;
            # any label missing on either side is an alignment error.
            common = self.columns.union(other.index)
            if (len(common) > len(self.columns) or
                    len(common) > len(other.index)):
                raise ValueError('matrices are not aligned')

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right.values
        else:
            # Raw array-like: no alignment, only a shape check below.
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError('Dot product shape mismatch, '
                                 '{s} vs {r}'.format(s=lvals.shape,
                                                     r=rvals.shape))

        if isinstance(other, DataFrame):
            return self._constructor(np.dot(lvals, rvals), index=left.index,
                                     columns=other.columns)
        elif isinstance(other, Series):
            return Series(np.dot(lvals, rvals), index=left.index)
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index)
            else:
                return Series(result, index=left.index)
        else:  # pragma: no cover
            raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
    def __matmul__(self, other):
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        # Delegates to DataFrame.dot for alignment and result dispatch.
        return self.dot(other)
    def __rmatmul__(self, other):
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        # other @ self == (self.T @ other.T).T, which lets us reuse
        # DataFrame.dot for the reflected operand.
        return self.T.dot(np.transpose(other)).T
- # ----------------------------------------------------------------------
- # IO methods (to / from other formats)
- @classmethod
- def from_dict(cls, data, orient='columns', dtype=None, columns=None):
- """
- Construct DataFrame from dict of array-like or dicts.
- Creates DataFrame object from dictionary by columns or by index
- allowing dtype specification.
- Parameters
- ----------
- data : dict
- Of the form {field : array-like} or {field : dict}.
- orient : {'columns', 'index'}, default 'columns'
- The "orientation" of the data. If the keys of the passed dict
- should be the columns of the resulting DataFrame, pass 'columns'
- (default). Otherwise if the keys should be rows, pass 'index'.
- dtype : dtype, default None
- Data type to force, otherwise infer.
- columns : list, default None
- Column labels to use when ``orient='index'``. Raises a ValueError
- if used with ``orient='columns'``.
- .. versionadded:: 0.23.0
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.from_records : DataFrame from ndarray (structured
- dtype), list of tuples, dict, or DataFrame.
- DataFrame : DataFrame object creation using constructor.
- Examples
- --------
- By default the keys of the dict become the DataFrame columns:
- >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
- >>> pd.DataFrame.from_dict(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
- Specify ``orient='index'`` to create the DataFrame using dictionary
- keys as rows:
- >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
- >>> pd.DataFrame.from_dict(data, orient='index')
- 0 1 2 3
- row_1 3 2 1 0
- row_2 a b c d
- When using the 'index' orientation, the column names can be
- specified manually:
- >>> pd.DataFrame.from_dict(data, orient='index',
- ... columns=['A', 'B', 'C', 'D'])
- A B C D
- row_1 3 2 1 0
- row_2 a b c d
- """
- index = None
- orient = orient.lower()
- if orient == 'index':
- if len(data) > 0:
- # TODO speed up Series case
- if isinstance(list(data.values())[0], (Series, dict)):
- data = _from_nested_dict(data)
- else:
- data, index = list(data.values()), list(data.keys())
- elif orient == 'columns':
- if columns is not None:
- raise ValueError("cannot use columns parameter with "
- "orient='columns'")
- else: # pragma: no cover
- raise ValueError('only recognize index or columns for orient')
- return cls(data, index=index, columns=columns, dtype=dtype)
- def to_numpy(self, dtype=None, copy=False):
- """
- Convert the DataFrame to a NumPy array.
- .. versionadded:: 0.24.0
- By default, the dtype of the returned array will be the common NumPy
- dtype of all types in the DataFrame. For example, if the dtypes are
- ``float16`` and ``float32``, the results dtype will be ``float32``.
- This may require copying data and coercing values, which may be
- expensive.
- Parameters
- ----------
- dtype : str or numpy.dtype, optional
- The dtype to pass to :meth:`numpy.asarray`
- copy : bool, default False
- Whether to ensure that the returned value is a not a view on
- another array. Note that ``copy=False`` does not *ensure* that
- ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
- a copy is made, even if not strictly necessary.
- Returns
- -------
- numpy.ndarray
- See Also
- --------
- Series.to_numpy : Similar method for Series.
- Examples
- --------
- >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
- array([[1, 3],
- [2, 4]])
- With heterogenous data, the lowest common type will have to
- be used.
- >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
- >>> df.to_numpy()
- array([[1. , 3. ],
- [2. , 4.5]])
- For a mix of numeric and non-numeric types, the output array will
- have object dtype.
- >>> df['C'] = pd.date_range('2000', periods=2)
- >>> df.to_numpy()
- array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
- [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
- """
- result = np.array(self.values, dtype=dtype, copy=copy)
- return result
- def to_dict(self, orient='dict', into=dict):
- """
- Convert the DataFrame to a dictionary.
- The type of the key-value pairs can be customized with the parameters
- (see below).
- Parameters
- ----------
- orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
- Determines the type of the values of the dictionary.
- - 'dict' (default) : dict like {column -> {index -> value}}
- - 'list' : dict like {column -> [values]}
- - 'series' : dict like {column -> Series(values)}
- - 'split' : dict like
- {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
- - 'records' : list like
- [{column -> value}, ... , {column -> value}]
- - 'index' : dict like {index -> {column -> value}}
- Abbreviations are allowed. `s` indicates `series` and `sp`
- indicates `split`.
- into : class, default dict
- The collections.Mapping subclass used for all Mappings
- in the return value. Can be the actual class or an empty
- instance of the mapping type you want. If you want a
- collections.defaultdict, you must pass it initialized.
- .. versionadded:: 0.21.0
- Returns
- -------
- dict, list or collections.Mapping
- Return a collections.Mapping object representing the DataFrame.
- The resulting transformation depends on the `orient` parameter.
- See Also
- --------
- DataFrame.from_dict: Create a DataFrame from a dictionary.
- DataFrame.to_json: Convert a DataFrame to JSON format.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2],
- ... 'col2': [0.5, 0.75]},
- ... index=['row1', 'row2'])
- >>> df
- col1 col2
- row1 1 0.50
- row2 2 0.75
- >>> df.to_dict()
- {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
- You can specify the return orientation.
- >>> df.to_dict('series')
- {'col1': row1 1
- row2 2
- Name: col1, dtype: int64,
- 'col2': row1 0.50
- row2 0.75
- Name: col2, dtype: float64}
- >>> df.to_dict('split')
- {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
- 'data': [[1, 0.5], [2, 0.75]]}
- >>> df.to_dict('records')
- [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
- >>> df.to_dict('index')
- {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
- You can also specify the mapping type.
- >>> from collections import OrderedDict, defaultdict
- >>> df.to_dict(into=OrderedDict)
- OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
- ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
- If you want a `defaultdict`, you need to initialize it:
- >>> dd = defaultdict(list)
- >>> df.to_dict('records', into=dd)
- [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
- defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
- """
- if not self.columns.is_unique:
- warnings.warn("DataFrame columns are not unique, some "
- "columns will be omitted.", UserWarning,
- stacklevel=2)
- # GH16122
- into_c = com.standardize_mapping(into)
- if orient.lower().startswith('d'):
- return into_c(
- (k, v.to_dict(into)) for k, v in compat.iteritems(self))
- elif orient.lower().startswith('l'):
- return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
- elif orient.lower().startswith('sp'):
- return into_c((('index', self.index.tolist()),
- ('columns', self.columns.tolist()),
- ('data', [
- list(map(com.maybe_box_datetimelike, t))
- for t in self.itertuples(index=False, name=None)
- ])))
- elif orient.lower().startswith('s'):
- return into_c((k, com.maybe_box_datetimelike(v))
- for k, v in compat.iteritems(self))
- elif orient.lower().startswith('r'):
- columns = self.columns.tolist()
- rows = (dict(zip(columns, row))
- for row in self.itertuples(index=False, name=None))
- return [
- into_c((k, com.maybe_box_datetimelike(v))
- for k, v in compat.iteritems(row))
- for row in rows]
- elif orient.lower().startswith('i'):
- if not self.index.is_unique:
- raise ValueError(
- "DataFrame index must be unique for orient='index'."
- )
- return into_c((t[0], dict(zip(self.columns, t[1:])))
- for t in self.itertuples(name=None))
- else:
- raise ValueError("orient '{o}' not understood".format(o=orient))
- def to_gbq(self, destination_table, project_id=None, chunksize=None,
- reauth=False, if_exists='fail', auth_local_webserver=False,
- table_schema=None, location=None, progress_bar=True,
- credentials=None, verbose=None, private_key=None):
- """
- Write a DataFrame to a Google BigQuery table.
- This function requires the `pandas-gbq package
- <https://pandas-gbq.readthedocs.io>`__.
- See the `How to authenticate with Google BigQuery
- <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
- guide for authentication instructions.
- Parameters
- ----------
- destination_table : str
- Name of table to be written, in the form ``dataset.tablename``.
- project_id : str, optional
- Google BigQuery Account project ID. Optional when available from
- the environment.
- chunksize : int, optional
- Number of rows to be inserted in each chunk from the dataframe.
- Set to ``None`` to load the whole dataframe at once.
- reauth : bool, default False
- Force Google BigQuery to re-authenticate the user. This is useful
- if multiple accounts are used.
- if_exists : str, default 'fail'
- Behavior when the destination table exists. Value can be one of:
- ``'fail'``
- If table exists, do nothing.
- ``'replace'``
- If table exists, drop it, recreate it, and insert data.
- ``'append'``
- If table exists, insert data. Create if does not exist.
- auth_local_webserver : bool, default False
- Use the `local webserver flow`_ instead of the `console flow`_
- when getting user credentials.
- .. _local webserver flow:
- http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
- .. _console flow:
- http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
- *New in version 0.2.0 of pandas-gbq*.
- table_schema : list of dicts, optional
- List of BigQuery table fields to which according DataFrame
- columns conform to, e.g. ``[{'name': 'col1', 'type':
- 'STRING'},...]``. If schema is not provided, it will be
- generated according to dtypes of DataFrame columns. See
- BigQuery API documentation on available names of a field.
- *New in version 0.3.1 of pandas-gbq*.
- location : str, optional
- Location where the load job should run. See the `BigQuery locations
- documentation
- <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
- list of available locations. The location must match that of the
- target dataset.
- *New in version 0.5.0 of pandas-gbq*.
- progress_bar : bool, default True
- Use the library `tqdm` to show the progress bar for the upload,
- chunk by chunk.
- *New in version 0.5.0 of pandas-gbq*.
- credentials : google.auth.credentials.Credentials, optional
- Credentials for a…
Large files files are truncated, but you can click here to view the full file