
/pandas/core/frame.py

http://github.com/wesm/pandas
  1. # pylint: disable=E1101
  2. # pylint: disable=W0212,W0703,W0622
  3. """
  4. DataFrame
  5. ---------
  6. An efficient 2D container for potentially mixed-type time series or other
  7. labeled data series.
  8. Similar to its R counterpart, data.frame, except providing automatic data
  9. alignment and a host of useful data manipulation methods having to do with the
  10. labeling information
  11. """
  12. from __future__ import division
  13. import collections
  14. from collections import OrderedDict
  15. import functools
  16. import itertools
  17. import sys
  18. import warnings
  19. from textwrap import dedent
  20. import numpy as np
  21. import numpy.ma as ma
  22. from pandas._libs import lib, algos as libalgos
  23. from pandas.util._decorators import (Appender, Substitution,
  24. rewrite_axis_style_signature,
  25. deprecate_kwarg)
  26. from pandas.util._validators import (validate_bool_kwarg,
  27. validate_axis_style_args)
  28. from pandas import compat
  29. from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u,
  30. PY36, raise_with_traceback, Iterator,
  31. string_and_binary_types)
  32. from pandas.compat.numpy import function as nv
  33. from pandas.core.dtypes.cast import (
  34. maybe_upcast,
  35. cast_scalar_to_array,
  36. infer_dtype_from_scalar,
  37. maybe_cast_to_datetime,
  38. maybe_infer_to_datetimelike,
  39. maybe_convert_platform,
  40. maybe_downcast_to_dtype,
  41. invalidate_string_dtypes,
  42. coerce_to_dtypes,
  43. maybe_upcast_putmask,
  44. find_common_type)
  45. from pandas.core.dtypes.common import (
  46. is_dict_like,
  47. is_datetime64tz_dtype,
  48. is_object_dtype,
  49. is_extension_type,
  50. is_extension_array_dtype,
  51. is_datetime64_any_dtype,
  52. is_bool_dtype,
  53. is_integer_dtype,
  54. is_float_dtype,
  55. is_integer,
  56. is_scalar,
  57. is_dtype_equal,
  58. needs_i8_conversion,
  59. infer_dtype_from_object,
  60. ensure_float64,
  61. ensure_int64,
  62. ensure_platform_int,
  63. is_list_like,
  64. is_nested_list_like,
  65. is_iterator,
  66. is_sequence,
  67. is_named_tuple)
  68. from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
  69. from pandas.core.dtypes.missing import isna, notna
  70. from pandas.core import algorithms
  71. from pandas.core import common as com
  72. from pandas.core import nanops
  73. from pandas.core import ops
  74. from pandas.core.accessor import CachedAccessor
  75. from pandas.core.arrays import Categorical, ExtensionArray
  76. from pandas.core.arrays.datetimelike import (
  77. DatetimeLikeArrayMixin as DatetimeLikeArray
  78. )
  79. from pandas.core.config import get_option
  80. from pandas.core.generic import NDFrame, _shared_docs
  81. from pandas.core.index import (Index, MultiIndex, ensure_index,
  82. ensure_index_from_sequences)
  83. from pandas.core.indexes import base as ibase
  84. from pandas.core.indexes.datetimes import DatetimeIndex
  85. from pandas.core.indexes.period import PeriodIndex
  86. from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
  87. check_bool_indexer)
  88. from pandas.core.internals import BlockManager
  89. from pandas.core.internals.construction import (
  90. masked_rec_array_to_mgr, get_names_from_index, to_arrays,
  91. reorder_arrays, init_ndarray, init_dict,
  92. arrays_to_mgr, sanitize_index)
  93. from pandas.core.series import Series
  94. from pandas.io.formats import console
  95. from pandas.io.formats import format as fmt
  96. from pandas.io.formats.printing import pprint_thing
  97. import pandas.plotting._core as gfx
  98. # ---------------------------------------------------------------------
  99. # Docstring templates
  100. _shared_doc_kwargs = dict(
  101. axes='index, columns', klass='DataFrame',
  102. axes_single_arg="{0 or 'index', 1 or 'columns'}",
  103. axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
  104. If 0 or 'index': apply function to each column.
  105. If 1 or 'columns': apply function to each row.""",
  106. optional_by="""
  107. by : str or list of str
  108. Name or list of names to sort by.
  109. - if `axis` is 0 or `'index'` then `by` may contain index
  110. levels and/or column labels
  111. - if `axis` is 1 or `'columns'` then `by` may contain column
  112. levels and/or index labels
  113. .. versionchanged:: 0.23.0
  114. Allow specifying index or column level names.""",
  115. versionadded_to_excel='',
  116. optional_labels="""labels : array-like, optional
  117. New labels / index to conform the axis specified by 'axis' to.""",
  118. optional_axis="""axis : int or str, optional
  119. Axis to target. Can be either the axis name ('index', 'columns')
  120. or number (0, 1).""",
  121. )
  122. _numeric_only_doc = """numeric_only : boolean, default None
  123. Include only float, int, boolean data. If None, will attempt to use
  124. everything, then use only numeric data
  125. """
  126. _merge_doc = """
  127. Merge DataFrame or named Series objects with a database-style join.
  128. The join is done on columns or indexes. If joining columns on
  129. columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
  130. on indexes or indexes on a column or columns, the index will be passed on.
  131. Parameters
  132. ----------%s
  133. right : DataFrame or named Series
  134. Object to merge with.
  135. how : {'left', 'right', 'outer', 'inner'}, default 'inner'
  136. Type of merge to be performed.
  137. * left: use only keys from left frame, similar to a SQL left outer join;
  138. preserve key order.
  139. * right: use only keys from right frame, similar to a SQL right outer join;
  140. preserve key order.
  141. * outer: use union of keys from both frames, similar to a SQL full outer
  142. join; sort keys lexicographically.
  143. * inner: use intersection of keys from both frames, similar to a SQL inner
  144. join; preserve the order of the left keys.
  145. on : label or list
  146. Column or index level names to join on. These must be found in both
  147. DataFrames. If `on` is None and not merging on indexes then this defaults
  148. to the intersection of the columns in both DataFrames.
  149. left_on : label or list, or array-like
  150. Column or index level names to join on in the left DataFrame. Can also
  151. be an array or list of arrays of the length of the left DataFrame.
  152. These arrays are treated as if they are columns.
  153. right_on : label or list, or array-like
  154. Column or index level names to join on in the right DataFrame. Can also
  155. be an array or list of arrays of the length of the right DataFrame.
  156. These arrays are treated as if they are columns.
  157. left_index : bool, default False
  158. Use the index from the left DataFrame as the join key(s). If it is a
  159. MultiIndex, the number of keys in the other DataFrame (either the index
  160. or a number of columns) must match the number of levels.
  161. right_index : bool, default False
  162. Use the index from the right DataFrame as the join key. Same caveats as
  163. left_index.
  164. sort : bool, default False
  165. Sort the join keys lexicographically in the result DataFrame. If False,
  166. the order of the join keys depends on the join type (how keyword).
  167. suffixes : tuple of (str, str), default ('_x', '_y')
  168. Suffix to apply to overlapping column names in the left and right
  169. side, respectively. To raise an exception on overlapping columns use
  170. (False, False).
  171. copy : bool, default True
  172. If False, avoid copy if possible.
  173. indicator : bool or str, default False
  174. If True, adds a column to output DataFrame called "_merge" with
  175. information on the source of each row.
  176. If string, column with information on source of each row will be added to
  177. output DataFrame, and column will be named value of string.
  178. Information column is Categorical-type and takes on a value of "left_only"
  179. for observations whose merge key only appears in 'left' DataFrame,
  180. "right_only" for observations whose merge key only appears in 'right'
  181. DataFrame, and "both" if the observation's merge key is found in both.
  182. validate : str, optional
  183. If specified, checks if merge is of specified type.
  184. * "one_to_one" or "1:1": check if merge keys are unique in both
  185. left and right datasets.
  186. * "one_to_many" or "1:m": check if merge keys are unique in left
  187. dataset.
  188. * "many_to_one" or "m:1": check if merge keys are unique in right
  189. dataset.
  190. * "many_to_many" or "m:m": allowed, but does not result in checks.
  191. .. versionadded:: 0.21.0
  192. Returns
  193. -------
  194. DataFrame
  195. A DataFrame of the two merged objects.
  196. See Also
  197. --------
  198. merge_ordered : Merge with optional filling/interpolation.
  199. merge_asof : Merge on nearest keys.
  200. DataFrame.join : Similar method using indices.
  201. Notes
  202. -----
  203. Support for specifying index levels as the `on`, `left_on`, and
  204. `right_on` parameters was added in version 0.23.0.
  205. Support for merging named Series objects was added in version 0.24.0.
  206. Examples
  207. --------
  208. >>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
  209. ... 'value': [1, 2, 3, 5]})
  210. >>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
  211. ... 'value': [5, 6, 7, 8]})
  212. >>> df1
  213. lkey value
  214. 0 foo 1
  215. 1 bar 2
  216. 2 baz 3
  217. 3 foo 5
  218. >>> df2
  219. rkey value
  220. 0 foo 5
  221. 1 bar 6
  222. 2 baz 7
  223. 3 foo 8
  224. Merge df1 and df2 on the lkey and rkey columns. The value columns have
  225. the default suffixes, _x and _y, appended.
  226. >>> df1.merge(df2, left_on='lkey', right_on='rkey')
  227. lkey value_x rkey value_y
  228. 0 foo 1 foo 5
  229. 1 foo 1 foo 8
  230. 2 foo 5 foo 5
  231. 3 foo 5 foo 8
  232. 4 bar 2 bar 6
  233. 5 baz 3 baz 7
  234. Merge DataFrames df1 and df2 with specified left and right suffixes
  235. appended to any overlapping columns.
  236. >>> df1.merge(df2, left_on='lkey', right_on='rkey',
  237. ... suffixes=('_left', '_right'))
  238. lkey value_left rkey value_right
  239. 0 foo 1 foo 5
  240. 1 foo 1 foo 8
  241. 2 foo 5 foo 5
  242. 3 foo 5 foo 8
  243. 4 bar 2 bar 6
  244. 5 baz 3 baz 7
  245. Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
  246. any overlapping columns.
  247. >>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
  248. Traceback (most recent call last):
  249. ...
  250. ValueError: columns overlap but no suffix specified:
  251. Index(['value'], dtype='object')
  252. """
  253. # -----------------------------------------------------------------------
  254. # DataFrame class
  255. class DataFrame(NDFrame):
  256. """
  257. Two-dimensional size-mutable, potentially heterogeneous tabular data
  258. structure with labeled axes (rows and columns). Arithmetic operations
  259. align on both row and column labels. Can be thought of as a dict-like
  260. container for Series objects. The primary pandas data structure.
  261. Parameters
  262. ----------
  263. data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
  264. Dict can contain Series, arrays, constants, or list-like objects
  265. .. versionchanged:: 0.23.0
  266. If data is a dict, argument order is maintained for Python 3.6
  267. and later.
  268. index : Index or array-like
  269. Index to use for resulting frame. Will default to RangeIndex if
  270. no indexing information is part of the input data and no index is provided
  271. columns : Index or array-like
  272. Column labels to use for resulting frame. Will default to
  273. RangeIndex (0, 1, 2, ..., n) if no column labels are provided
  274. dtype : dtype, default None
  275. Data type to force. Only a single dtype is allowed. If None, infer
  276. copy : boolean, default False
  277. Copy data from inputs. Only affects DataFrame / 2d ndarray input
  278. See Also
  279. --------
  280. DataFrame.from_records : Constructor from tuples, also record arrays.
  281. DataFrame.from_dict : From dicts of Series, arrays, or dicts.
  282. DataFrame.from_items : From sequence of (key, value) pairs
  283. read_csv, pandas.read_table, pandas.read_clipboard.
  284. Examples
  285. --------
  286. Constructing DataFrame from a dictionary.
  287. >>> d = {'col1': [1, 2], 'col2': [3, 4]}
  288. >>> df = pd.DataFrame(data=d)
  289. >>> df
  290. col1 col2
  291. 0 1 3
  292. 1 2 4
  293. Notice that the inferred dtype is int64.
  294. >>> df.dtypes
  295. col1 int64
  296. col2 int64
  297. dtype: object
  298. To enforce a single dtype:
  299. >>> df = pd.DataFrame(data=d, dtype=np.int8)
  300. >>> df.dtypes
  301. col1 int8
  302. col2 int8
  303. dtype: object
  304. Constructing DataFrame from numpy ndarray:
  305. >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
  306. ... columns=['a', 'b', 'c'])
  307. >>> df2
  308. a b c
  309. 0 1 2 3
  310. 1 4 5 6
  311. 2 7 8 9
  312. """
  313. @property
  314. def _constructor(self):
  315. return DataFrame
  316. _constructor_sliced = Series
  317. _deprecations = NDFrame._deprecations | frozenset(
  318. ['get_value', 'set_value', 'from_csv', 'from_items'])
  319. _accessors = set()
  320. @property
  321. def _constructor_expanddim(self):
  322. from pandas.core.panel import Panel
  323. return Panel
  324. # ----------------------------------------------------------------------
  325. # Constructors
  326. def __init__(self, data=None, index=None, columns=None, dtype=None,
  327. copy=False):
  328. if data is None:
  329. data = {}
  330. if dtype is not None:
  331. dtype = self._validate_dtype(dtype)
  332. if isinstance(data, DataFrame):
  333. data = data._data
  334. if isinstance(data, BlockManager):
  335. mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
  336. dtype=dtype, copy=copy)
  337. elif isinstance(data, dict):
  338. mgr = init_dict(data, index, columns, dtype=dtype)
  339. elif isinstance(data, ma.MaskedArray):
  340. import numpy.ma.mrecords as mrecords
  341. # masked recarray
  342. if isinstance(data, mrecords.MaskedRecords):
  343. mgr = masked_rec_array_to_mgr(data, index, columns, dtype,
  344. copy)
  345. # a masked array
  346. else:
  347. mask = ma.getmaskarray(data)
  348. if mask.any():
  349. data, fill_value = maybe_upcast(data, copy=True)
  350. data.soften_mask() # set hardmask False if it was True
  351. data[mask] = fill_value
  352. else:
  353. data = data.copy()
  354. mgr = init_ndarray(data, index, columns, dtype=dtype,
  355. copy=copy)
  356. elif isinstance(data, (np.ndarray, Series, Index)):
  357. if data.dtype.names:
  358. data_columns = list(data.dtype.names)
  359. data = {k: data[k] for k in data_columns}
  360. if columns is None:
  361. columns = data_columns
  362. mgr = init_dict(data, index, columns, dtype=dtype)
  363. elif getattr(data, 'name', None) is not None:
  364. mgr = init_dict({data.name: data}, index, columns,
  365. dtype=dtype)
  366. else:
  367. mgr = init_ndarray(data, index, columns, dtype=dtype,
  368. copy=copy)
  369. # If data is list-like or an Iterable (it will be consumed into a list)
  370. elif (isinstance(data, compat.Iterable)
  371. and not isinstance(data, string_and_binary_types)):
  372. if not isinstance(data, compat.Sequence):
  373. data = list(data)
  374. if len(data) > 0:
  375. if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
  376. if is_named_tuple(data[0]) and columns is None:
  377. columns = data[0]._fields
  378. arrays, columns = to_arrays(data, columns, dtype=dtype)
  379. columns = ensure_index(columns)
  380. # set the index
  381. if index is None:
  382. if isinstance(data[0], Series):
  383. index = get_names_from_index(data)
  384. elif isinstance(data[0], Categorical):
  385. index = ibase.default_index(len(data[0]))
  386. else:
  387. index = ibase.default_index(len(data))
  388. mgr = arrays_to_mgr(arrays, columns, index, columns,
  389. dtype=dtype)
  390. else:
  391. mgr = init_ndarray(data, index, columns, dtype=dtype,
  392. copy=copy)
  393. else:
  394. mgr = init_dict({}, index, columns, dtype=dtype)
  395. else:
  396. try:
  397. arr = np.array(data, dtype=dtype, copy=copy)
  398. except (ValueError, TypeError) as e:
  399. exc = TypeError('DataFrame constructor called with '
  400. 'incompatible data and dtype: {e}'.format(e=e))
  401. raise_with_traceback(exc)
  402. if arr.ndim == 0 and index is not None and columns is not None:
  403. values = cast_scalar_to_array((len(index), len(columns)),
  404. data, dtype=dtype)
  405. mgr = init_ndarray(values, index, columns,
  406. dtype=values.dtype, copy=False)
  407. else:
  408. raise ValueError('DataFrame constructor not properly called!')
  409. NDFrame.__init__(self, mgr, fastpath=True)
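# Illustrative summary (annotation, not part of the original source): the
# constructor above dispatches on the type of `data` roughly as follows,
# assuming the construction helpers behave as their names suggest:
#   BlockManager                 -> _init_mgr (reuse the existing blocks)
#   dict                         -> init_dict
#   MaskedArray / MaskedRecords  -> fill masked values, then init_ndarray,
#                                   or masked_rec_array_to_mgr
#   ndarray / Series / Index     -> init_dict (structured or named input)
#                                   or init_ndarray
#   other non-string iterables   -> consumed into a list, then to_arrays
#   scalar + index and columns   -> cast_scalar_to_array, then init_ndarray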
  410. # ----------------------------------------------------------------------
  411. @property
  412. def axes(self):
  413. """
  414. Return a list representing the axes of the DataFrame.
  415. It has the row axis labels and column axis labels as the only members.
  416. They are returned in that order.
  417. Examples
  418. --------
  419. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
  420. >>> df.axes
  421. [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
  422. dtype='object')]
  423. """
  424. return [self.index, self.columns]
  425. @property
  426. def shape(self):
  427. """
  428. Return a tuple representing the dimensionality of the DataFrame.
  429. See Also
  430. --------
  431. ndarray.shape
  432. Examples
  433. --------
  434. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
  435. >>> df.shape
  436. (2, 2)
  437. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
  438. ... 'col3': [5, 6]})
  439. >>> df.shape
  440. (2, 3)
  441. """
  442. return len(self.index), len(self.columns)
  443. @property
  444. def _is_homogeneous_type(self):
  445. """
  446. Whether all the columns in a DataFrame have the same type.
  447. Returns
  448. -------
  449. bool
  450. Examples
  451. --------
  452. >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
  453. True
  454. >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
  455. False
  456. Items with the same type but different sizes are considered
  457. different types.
  458. >>> DataFrame({
  459. ... "A": np.array([1, 2], dtype=np.int32),
  460. ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
  461. False
  462. """
  463. if self._data.any_extension_types:
  464. return len({block.dtype for block in self._data.blocks}) == 1
  465. else:
  466. return not self._data.is_mixed_type
  467. # ----------------------------------------------------------------------
  468. # Rendering Methods
  469. def _repr_fits_vertical_(self):
  470. """
  471. Check length against max_rows.
  472. """
  473. max_rows = get_option("display.max_rows")
  474. return len(self) <= max_rows
  475. def _repr_fits_horizontal_(self, ignore_width=False):
  476. """
  477. Check if full repr fits in horizontal boundaries imposed by the display
  478. options width and max_columns.
  479. In case of a non-interactive session, no boundaries apply.
  480. `ignore_width` is here so ipynb+HTML output can behave the way
  481. users expect. display.max_columns remains in effect.
  482. GH3541, GH3573
  483. """
  484. width, height = console.get_console_size()
  485. max_columns = get_option("display.max_columns")
  486. nb_columns = len(self.columns)
  487. # exceed max columns
  488. if ((max_columns and nb_columns > max_columns) or
  489. ((not ignore_width) and width and nb_columns > (width // 2))):
  490. return False
  491. # used by _repr_html_ under IPython notebook; scripts ignore terminal
  492. # dims
  493. if ignore_width or not console.in_interactive_session():
  494. return True
  495. if (get_option('display.width') is not None or
  496. console.in_ipython_frontend()):
  497. # check at least the column row for excessive width
  498. max_rows = 1
  499. else:
  500. max_rows = get_option("display.max_rows")
  501. # when auto-detecting, so width=None and not in ipython front end
  502. # check whether repr fits horizontal by actually checking
  503. # the width of the rendered repr
  504. buf = StringIO()
  505. # only care about the stuff we'll actually print out
  506. # and to_string on entire frame may be expensive
  507. d = self
  508. if max_rows is not None: # otherwise rows are unlimited
  509. # min of two, where one may be None
  510. d = d.iloc[:min(max_rows, len(d))]
  511. else:
  512. return True
  513. d.to_string(buf=buf)
  514. value = buf.getvalue()
  515. repr_width = max(len(l) for l in value.split('\n'))
  516. return repr_width < width
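# Illustrative sketch (not part of the original source): the check above is
# driven entirely by display options, e.g.
#
#   >>> pd.set_option("display.max_columns", 10)  # column-count gate
#   >>> pd.set_option("display.width", 80)        # rendered-width gate
#   >>> df = pd.DataFrame(np.ones((2, 20)))
#   >>> df._repr_fits_horizontal_()               # False: 20 > 10 columns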
  517. def _info_repr(self):
  518. """
  519. True if the repr should show the info view.
  520. """
  521. info_repr_option = (get_option("display.large_repr") == "info")
  522. return info_repr_option and not (self._repr_fits_horizontal_() and
  523. self._repr_fits_vertical_())
  524. def __unicode__(self):
  525. """
  526. Return a string representation for a particular DataFrame.
  527. Invoked by unicode(df) in py2 only. Yields a Unicode String in both
  528. py2/py3.
  529. """
  530. buf = StringIO(u(""))
  531. if self._info_repr():
  532. self.info(buf=buf)
  533. return buf.getvalue()
  534. max_rows = get_option("display.max_rows")
  535. max_cols = get_option("display.max_columns")
  536. show_dimensions = get_option("display.show_dimensions")
  537. if get_option("display.expand_frame_repr"):
  538. width, _ = console.get_console_size()
  539. else:
  540. width = None
  541. self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
  542. line_width=width, show_dimensions=show_dimensions)
  543. return buf.getvalue()
  544. def _repr_html_(self):
  545. """
  546. Return an HTML representation for a particular DataFrame.
  547. Mainly for IPython notebook.
  548. """
  549. if self._info_repr():
  550. buf = StringIO(u(""))
  551. self.info(buf=buf)
  552. # need to escape the <class>, should be the first line.
  553. val = buf.getvalue().replace('<', r'&lt;', 1)
  554. val = val.replace('>', r'&gt;', 1)
  555. return '<pre>' + val + '</pre>'
  556. if get_option("display.notebook_repr_html"):
  557. max_rows = get_option("display.max_rows")
  558. max_cols = get_option("display.max_columns")
  559. show_dimensions = get_option("display.show_dimensions")
  560. return self.to_html(max_rows=max_rows, max_cols=max_cols,
  561. show_dimensions=show_dimensions, notebook=True)
  562. else:
  563. return None
  564. @Substitution(header='Write out the column names. If a list of strings '
  565. 'is given, it is assumed to be aliases for the '
  566. 'column names')
  567. @Substitution(shared_params=fmt.common_docstring,
  568. returns=fmt.return_docstring)
  569. def to_string(self, buf=None, columns=None, col_space=None, header=True,
  570. index=True, na_rep='NaN', formatters=None, float_format=None,
  571. sparsify=None, index_names=True, justify=None,
  572. max_rows=None, max_cols=None, show_dimensions=False,
  573. decimal='.', line_width=None):
  574. """
  575. Render a DataFrame to a console-friendly tabular output.
  576. %(shared_params)s
  577. line_width : int, optional
  578. Width to wrap a line in characters.
  579. %(returns)s
  580. See Also
  581. --------
  582. to_html : Convert DataFrame to HTML.
  583. Examples
  584. --------
  585. >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
  586. >>> df = pd.DataFrame(d)
  587. >>> print(df.to_string())
  588. col1 col2
  589. 0 1 4
  590. 1 2 5
  591. 2 3 6
  592. """
  593. formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
  594. col_space=col_space, na_rep=na_rep,
  595. formatters=formatters,
  596. float_format=float_format,
  597. sparsify=sparsify, justify=justify,
  598. index_names=index_names,
  599. header=header, index=index,
  600. max_rows=max_rows,
  601. max_cols=max_cols,
  602. show_dimensions=show_dimensions,
  603. decimal=decimal,
  604. line_width=line_width)
  605. formatter.to_string()
  606. if buf is None:
  607. result = formatter.buf.getvalue()
  608. return result
  609. # ----------------------------------------------------------------------
  610. @property
  611. def style(self):
  612. """
  613. Property returning a Styler object containing methods for
  614. building a styled HTML representation of the DataFrame.
  615. See Also
  616. --------
  617. io.formats.style.Styler
  618. """
  619. from pandas.io.formats.style import Styler
  620. return Styler(self)
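# Illustrative sketch (not part of the original source); the Styler method
# names below are from pandas.io.formats.style of this era:
#
#   >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
#   >>> html = df.style.highlight_max().render()  # styled HTML string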
  621. def iteritems(self):
  622. r"""
  623. Iterator over (column name, Series) pairs.
  624. Iterates over the DataFrame columns, returning a tuple with
  625. the column name and the content as a Series.
  626. Yields
  627. ------
  628. label : object
  629. The column names for the DataFrame being iterated over.
  630. content : Series
  631. The column entries belonging to each label, as a Series.
  632. See Also
  633. --------
  634. DataFrame.iterrows : Iterate over DataFrame rows as
  635. (index, Series) pairs.
  636. DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
  637. of the values.
  638. Examples
  639. --------
  640. >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
  641. ... 'population': [1864, 22000, 80000]},
  642. ... index=['panda', 'polar', 'koala'])
  643. >>> df
  644. species population
  645. panda bear 1864
  646. polar bear 22000
  647. koala marsupial 80000
  648. >>> for label, content in df.iteritems():
  649. ... print('label:', label)
  650. ... print('content:', content, sep='\n')
  651. ...
  652. label: species
  653. content:
  654. panda bear
  655. polar bear
  656. koala marsupial
  657. Name: species, dtype: object
  658. label: population
  659. content:
  660. panda 1864
  661. polar 22000
  662. koala 80000
  663. Name: population, dtype: int64
  664. """
  665. if self.columns.is_unique and hasattr(self, '_item_cache'):
  666. for k in self.columns:
  667. yield k, self._get_item_cache(k)
  668. else:
  669. for i, k in enumerate(self.columns):
  670. yield k, self._ixs(i, axis=1)
  671. def iterrows(self):
  672. """
  673. Iterate over DataFrame rows as (index, Series) pairs.
  674. Yields
  675. ------
  676. index : label or tuple of label
  677. The index of the row. A tuple for a `MultiIndex`.
  678. data : Series
  679. The data of the row as a Series.
  680. it : generator
  681. A generator that iterates over the rows of the frame.
  682. See Also
  683. --------
  684. itertuples : Iterate over DataFrame rows as namedtuples of the values.
  685. iteritems : Iterate over (column name, Series) pairs.
  686. Notes
  687. -----
  688. 1. Because ``iterrows`` returns a Series for each row,
  689. it does **not** preserve dtypes across the rows (dtypes are
  690. preserved across columns for DataFrames). For example,
  691. >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
  692. >>> row = next(df.iterrows())[1]
  693. >>> row
  694. int 1.0
  695. float 1.5
  696. Name: 0, dtype: float64
  697. >>> print(row['int'].dtype)
  698. float64
  699. >>> print(df['int'].dtype)
  700. int64
  701. To preserve dtypes while iterating over the rows, it is better
  702. to use :meth:`itertuples` which returns namedtuples of the values
  703. and which is generally faster than ``iterrows``.
  704. 2. You should **never modify** something you are iterating over.
  705. This is not guaranteed to work in all cases. Depending on the
  706. data types, the iterator returns a copy and not a view, and writing
  707. to it will have no effect.
  708. """
  709. columns = self.columns
  710. klass = self._constructor_sliced
  711. for k, v in zip(self.index, self.values):
  712. s = klass(v, index=columns, name=k)
  713. yield k, s
  714. def itertuples(self, index=True, name="Pandas"):
  715. """
  716. Iterate over DataFrame rows as namedtuples.
  717. Parameters
  718. ----------
  719. index : bool, default True
  720. If True, return the index as the first element of the tuple.
  721. name : str or None, default "Pandas"
  722. The name of the returned namedtuples or None to return regular
  723. tuples.
  724. Yields
  725. -------
  726. collections.namedtuple
  727. Yields a namedtuple for each row in the DataFrame with the first
  728. field possibly being the index and following fields being the
  729. column values.
  730. See Also
  731. --------
  732. DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
  733. pairs.
  734. DataFrame.iteritems : Iterate over (column name, Series) pairs.
  735. Notes
  736. -----
  737. The column names will be renamed to positional names if they are
  738. invalid Python identifiers, repeated, or start with an underscore.
  739. With a large number of columns (>255), regular tuples are returned.
  740. Examples
  741. --------
  742. >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
  743. ... index=['dog', 'hawk'])
  744. >>> df
  745. num_legs num_wings
  746. dog 4 0
  747. hawk 2 2
  748. >>> for row in df.itertuples():
  749. ... print(row)
  750. ...
  751. Pandas(Index='dog', num_legs=4, num_wings=0)
  752. Pandas(Index='hawk', num_legs=2, num_wings=2)
  753. By setting the `index` parameter to False we can remove the index
  754. as the first element of the tuple:
  755. >>> for row in df.itertuples(index=False):
  756. ... print(row)
  757. ...
  758. Pandas(num_legs=4, num_wings=0)
  759. Pandas(num_legs=2, num_wings=2)
  760. With the `name` parameter set we set a custom name for the yielded
  761. namedtuples:
  762. >>> for row in df.itertuples(name='Animal'):
  763. ... print(row)
  764. ...
  765. Animal(Index='dog', num_legs=4, num_wings=0)
  766. Animal(Index='hawk', num_legs=2, num_wings=2)
  767. """
  768. arrays = []
  769. fields = list(self.columns)
  770. if index:
  771. arrays.append(self.index)
  772. fields.insert(0, "Index")
  773. # use integer indexing because of possible duplicate column names
  774. arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
  775. # Python 3 supports at most 255 arguments to constructor, and
  776. # things get slow with this many fields in Python 2
  777. if name is not None and len(self.columns) + index < 256:
  778. # `rename` is unsupported in Python 2.6
  779. try:
  780. itertuple = collections.namedtuple(name, fields, rename=True)
  781. return map(itertuple._make, zip(*arrays))
  782. except Exception:
  783. pass
  784. # fallback to regular tuples
  785. return zip(*arrays)
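# Illustrative sketch (not part of the original source): because the
# namedtuple above is built with rename=True, column names that are invalid
# Python identifiers become positional names, e.g.
#
#   >>> df = pd.DataFrame({'a b': [1], 'class': [2]})
#   >>> next(df.itertuples())
#   Pandas(Index=0, _1=1, _2=2)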
  786. items = iteritems
  787. def __len__(self):
  788. """
  789. Returns length of info axis, but here we use the index.
  790. """
  791. return len(self.index)
  792. def dot(self, other):
  793. """
  794. Compute the matrix multiplication between the DataFrame and other.
  795. This method computes the matrix product between the DataFrame and the
  796. values of another Series, DataFrame or a numpy array.
  797. It can also be called using ``self @ other`` in Python >= 3.5.
  798. Parameters
  799. ----------
  800. other : Series, DataFrame or array-like
  801. The other object to compute the matrix product with.
  802. Returns
  803. -------
  804. Series or DataFrame
  805. If other is a Series, return the matrix product between self and
  806. other as a Series. If other is a DataFrame or a numpy.array, return
  807. the matrix product of self and other in a DataFrame or a np.array.
  808. See Also
  809. --------
  810. Series.dot: Similar method for Series.
  811. Notes
  812. -----
  813. The dimensions of DataFrame and other must be compatible in order to
  814. compute the matrix multiplication.
  815. The dot method for Series computes the inner product, instead of the
  816. matrix product here.
  817. Examples
  818. --------
  819. Here we multiply a DataFrame with a Series.
  820. >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
  821. >>> s = pd.Series([1, 1, 2, 1])
  822. >>> df.dot(s)
  823. 0 -4
  824. 1 5
  825. dtype: int64
  826. Here we multiply a DataFrame with another DataFrame.
  827. >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
  828. >>> df.dot(other)
  829. 0 1
  830. 0 1 4
  831. 1 2 2
  832. Note that the dot method gives the same result as @
  833. >>> df @ other
  834. 0 1
  835. 0 1 4
  836. 1 2 2
  837. The dot method also works if other is an np.array.
  838. >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
  839. >>> df.dot(arr)
  840. 0 1
  841. 0 1 4
  842. 1 2 2
  843. """
  844. if isinstance(other, (Series, DataFrame)):
  845. common = self.columns.union(other.index)
  846. if (len(common) > len(self.columns) or
  847. len(common) > len(other.index)):
  848. raise ValueError('matrices are not aligned')
  849. left = self.reindex(columns=common, copy=False)
  850. right = other.reindex(index=common, copy=False)
  851. lvals = left.values
  852. rvals = right.values
  853. else:
  854. left = self
  855. lvals = self.values
  856. rvals = np.asarray(other)
  857. if lvals.shape[1] != rvals.shape[0]:
  858. raise ValueError('Dot product shape mismatch, '
  859. '{s} vs {r}'.format(s=lvals.shape,
  860. r=rvals.shape))
  861. if isinstance(other, DataFrame):
  862. return self._constructor(np.dot(lvals, rvals), index=left.index,
  863. columns=other.columns)
  864. elif isinstance(other, Series):
  865. return Series(np.dot(lvals, rvals), index=left.index)
  866. elif isinstance(rvals, (np.ndarray, Index)):
  867. result = np.dot(lvals, rvals)
  868. if result.ndim == 2:
  869. return self._constructor(result, index=left.index)
  870. else:
  871. return Series(result, index=left.index)
  872. else: # pragma: no cover
  873. raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
  874. def __matmul__(self, other):
  875. """
  876. Matrix multiplication using binary `@` operator in Python>=3.5.
  877. """
  878. return self.dot(other)
  879. def __rmatmul__(self, other):
  880. """
  881. Matrix multiplication using binary `@` operator in Python>=3.5.
  882. """
  883. return self.T.dot(np.transpose(other)).T
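# Illustrative sketch (not part of the original source): __rmatmul__ handles
# the case where the left operand is not a DataFrame, e.g.
#
#   >>> df = pd.DataFrame([[1, 2], [3, 4]])
#   >>> np.array([[1, 0], [0, 1]]) @ df   # delegates to df.__rmatmul__
#      0  1
#   0  1  2
#   1  3  4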
  884. # ----------------------------------------------------------------------
  885. # IO methods (to / from other formats)
  886. @classmethod
  887. def from_dict(cls, data, orient='columns', dtype=None, columns=None):
  888. """
  889. Construct DataFrame from dict of array-like or dicts.
  890. Creates DataFrame object from dictionary by columns or by index
  891. allowing dtype specification.
  892. Parameters
  893. ----------
  894. data : dict
  895. Of the form {field : array-like} or {field : dict}.
  896. orient : {'columns', 'index'}, default 'columns'
  897. The "orientation" of the data. If the keys of the passed dict
  898. should be the columns of the resulting DataFrame, pass 'columns'
  899. (default). Otherwise if the keys should be rows, pass 'index'.
  900. dtype : dtype, default None
  901. Data type to force, otherwise infer.
  902. columns : list, default None
  903. Column labels to use when ``orient='index'``. Raises a ValueError
  904. if used with ``orient='columns'``.
  905. .. versionadded:: 0.23.0
  906. Returns
  907. -------
  908. DataFrame
  909. See Also
  910. --------
  911. DataFrame.from_records : DataFrame from ndarray (structured
  912. dtype), list of tuples, dict, or DataFrame.
  913. DataFrame : DataFrame object creation using constructor.
  914. Examples
  915. --------
  916. By default the keys of the dict become the DataFrame columns:
  917. >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
  918. >>> pd.DataFrame.from_dict(data)
  919. col_1 col_2
  920. 0 3 a
  921. 1 2 b
  922. 2 1 c
  923. 3 0 d
  924. Specify ``orient='index'`` to create the DataFrame using dictionary
  925. keys as rows:
  926. >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
  927. >>> pd.DataFrame.from_dict(data, orient='index')
  928. 0 1 2 3
  929. row_1 3 2 1 0
  930. row_2 a b c d
  931. When using the 'index' orientation, the column names can be
  932. specified manually:
  933. >>> pd.DataFrame.from_dict(data, orient='index',
  934. ... columns=['A', 'B', 'C', 'D'])
  935. A B C D
  936. row_1 3 2 1 0
  937. row_2 a b c d
  938. """
  939. index = None
  940. orient = orient.lower()
  941. if orient == 'index':
  942. if len(data) > 0:
  943. # TODO speed up Series case
  944. if isinstance(list(data.values())[0], (Series, dict)):
  945. data = _from_nested_dict(data)
  946. else:
  947. data, index = list(data.values()), list(data.keys())
  948. elif orient == 'columns':
  949. if columns is not None:
  950. raise ValueError("cannot use columns parameter with "
  951. "orient='columns'")
  952. else: # pragma: no cover
  953. raise ValueError('only recognize index or columns for orient')
  954. return cls(data, index=index, columns=columns, dtype=dtype)
  955. def to_numpy(self, dtype=None, copy=False):
  956. """
  957. Convert the DataFrame to a NumPy array.
  958. .. versionadded:: 0.24.0
  959. By default, the dtype of the returned array will be the common NumPy
  960. dtype of all types in the DataFrame. For example, if the dtypes are
  961. ``float16`` and ``float32``, the resulting dtype will be ``float32``.
  962. This may require copying data and coercing values, which may be
  963. expensive.
  964. Parameters
  965. ----------
  966. dtype : str or numpy.dtype, optional
  967. The dtype to pass to :meth:`numpy.asarray`
  968. copy : bool, default False
  969. Whether to ensure that the returned value is not a view on
  970. another array. Note that ``copy=False`` does not *ensure* that
  971. ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
  972. a copy is made, even if not strictly necessary.
  973. Returns
  974. -------
  975. numpy.ndarray
  976. See Also
  977. --------
  978. Series.to_numpy : Similar method for Series.
  979. Examples
  980. --------
  981. >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
  982. array([[1, 3],
  983. [2, 4]])
  984. With heterogeneous data, the lowest common type will have to
  985. be used.
  986. >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
  987. >>> df.to_numpy()
  988. array([[1. , 3. ],
  989. [2. , 4.5]])
  990. For a mix of numeric and non-numeric types, the output array will
  991. have object dtype.
  992. >>> df['C'] = pd.date_range('2000', periods=2)
  993. >>> df.to_numpy()
  994. array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
  995. [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
  996. """
  997. result = np.array(self.values, dtype=dtype, copy=copy)
  998. return result
  999. def to_dict(self, orient='dict', into=dict):
  1000. """
  1001. Convert the DataFrame to a dictionary.
  1002. The type of the key-value pairs can be customized with the parameters
  1003. (see below).
  1004. Parameters
  1005. ----------
  1006. orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
  1007. Determines the type of the values of the dictionary.
  1008. - 'dict' (default) : dict like {column -> {index -> value}}
  1009. - 'list' : dict like {column -> [values]}
  1010. - 'series' : dict like {column -> Series(values)}
  1011. - 'split' : dict like
  1012. {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
  1013. - 'records' : list like
  1014. [{column -> value}, ... , {column -> value}]
  1015. - 'index' : dict like {index -> {column -> value}}
  1016. Abbreviations are allowed. `s` indicates `series` and `sp`
  1017. indicates `split`.
  1018. into : class, default dict
  1019. The collections.Mapping subclass used for all Mappings
  1020. in the return value. Can be the actual class or an empty
  1021. instance of the mapping type you want. If you want a
  1022. collections.defaultdict, you must pass it initialized.
  1023. .. versionadded:: 0.21.0
  1024. Returns
  1025. -------
  1026. dict, list or collections.Mapping
  1027. Return a collections.Mapping object representing the DataFrame.
  1028. The resulting transformation depends on the `orient` parameter.
  1029. See Also
  1030. --------
  1031. DataFrame.from_dict: Create a DataFrame from a dictionary.
  1032. DataFrame.to_json: Convert a DataFrame to JSON format.
  1033. Examples
  1034. --------
  1035. >>> df = pd.DataFrame({'col1': [1, 2],
  1036. ... 'col2': [0.5, 0.75]},
  1037. ... index=['row1', 'row2'])
  1038. >>> df
  1039. col1 col2
  1040. row1 1 0.50
  1041. row2 2 0.75
  1042. >>> df.to_dict()
  1043. {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
  1044. You can specify the return orientation.
  1045. >>> df.to_dict('series')
  1046. {'col1': row1 1
  1047. row2 2
  1048. Name: col1, dtype: int64,
  1049. 'col2': row1 0.50
  1050. row2 0.75
  1051. Name: col2, dtype: float64}
  1052. >>> df.to_dict('split')
  1053. {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
  1054. 'data': [[1, 0.5], [2, 0.75]]}
  1055. >>> df.to_dict('records')
  1056. [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
  1057. >>> df.to_dict('index')
  1058. {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
  1059. You can also specify the mapping type.
  1060. >>> from collections import OrderedDict, defaultdict
  1061. >>> df.to_dict(into=OrderedDict)
  1062. OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
  1063. ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
  1064. If you want a `defaultdict`, you need to initialize it:
  1065. >>> dd = defaultdict(list)
  1066. >>> df.to_dict('records', into=dd)
  1067. [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
  1068. defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
  1069. """
  1070. if not self.columns.is_unique:
  1071. warnings.warn("DataFrame columns are not unique, some "
  1072. "columns will be omitted.", UserWarning,
  1073. stacklevel=2)
  1074. # GH16122
  1075. into_c = com.standardize_mapping(into)
  1076. if orient.lower().startswith('d'):
  1077. return into_c(
  1078. (k, v.to_dict(into)) for k, v in compat.iteritems(self))
  1079. elif orient.lower().startswith('l'):
  1080. return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
  1081. elif orient.lower().startswith('sp'):
  1082. return into_c((('index', self.index.tolist()),
  1083. ('columns', self.columns.tolist()),
  1084. ('data', [
  1085. list(map(com.maybe_box_datetimelike, t))
  1086. for t in self.itertuples(index=False, name=None)
  1087. ])))
  1088. elif orient.lower().startswith('s'):
  1089. return into_c((k, com.maybe_box_datetimelike(v))
  1090. for k, v in compat.iteritems(self))
  1091. elif orient.lower().startswith('r'):
  1092. columns = self.columns.tolist()
  1093. rows = (dict(zip(columns, row))
  1094. for row in self.itertuples(index=False, name=None))
  1095. return [
  1096. into_c((k, com.maybe_box_datetimelike(v))
  1097. for k, v in compat.iteritems(row))
  1098. for row in rows]
  1099. elif orient.lower().startswith('i'):
  1100. if not self.index.is_unique:
  1101. raise ValueError(
  1102. "DataFrame index must be unique for orient='index'."
  1103. )
  1104. return into_c((t[0], dict(zip(self.columns, t[1:])))
  1105. for t in self.itertuples(name=None))
  1106. else:
  1107. raise ValueError("orient '{o}' not understood".format(o=orient))
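# Illustrative note (not part of the original source): the orient matching
# above is prefix-based, so abbreviations work, e.g.
#
#   >>> df.to_dict('r')    # same as 'records'
#   >>> df.to_dict('sp')   # same as 'split'; plain 's' means 'series'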
  1108. def to_gbq(self, destination_table, project_id=None, chunksize=None,
  1109. reauth=False, if_exists='fail', auth_local_webserver=False,
  1110. table_schema=None, location=None, progress_bar=True,
  1111. credentials=None, verbose=None, private_key=None):
  1112. """
  1113. Write a DataFrame to a Google BigQuery table.
  1114. This function requires the `pandas-gbq package
  1115. <https://pandas-gbq.readthedocs.io>`__.
  1116. See the `How to authenticate with Google BigQuery
  1117. <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
  1118. guide for authentication instructions.
  1119. Parameters
  1120. ----------
  1121. destination_table : str
  1122. Name of table to be written, in the form ``dataset.tablename``.
  1123. project_id : str, optional
  1124. Google BigQuery Account project ID. Optional when available from
  1125. the environment.
  1126. chunksize : int, optional
  1127. Number of rows to be inserted in each chunk from the dataframe.
  1128. Set to ``None`` to load the whole dataframe at once.
  1129. reauth : bool, default False
  1130. Force Google BigQuery to re-authenticate the user. This is useful
  1131. if multiple accounts are used.
  1132. if_exists : str, default 'fail'
  1133. Behavior when the destination table exists. Value can be one of:
  1134. ``'fail'``
  1135. If table exists, do nothing.
  1136. ``'replace'``
  1137. If table exists, drop it, recreate it, and insert data.
  1138. ``'append'``
  1139. If table exists, insert data. Create if does not exist.
  1140. auth_local_webserver : bool, default False
  1141. Use the `local webserver flow`_ instead of the `console flow`_
  1142. when getting user credentials.
  1143. .. _local webserver flow:
  1144. http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
  1145. .. _console flow:
  1146. http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
  1147. *New in version 0.2.0 of pandas-gbq*.
  1148. table_schema : list of dicts, optional
  1149. List of BigQuery table fields to which the DataFrame
  1150. columns conform, e.g. ``[{'name': 'col1', 'type':
  1151. 'STRING'},...]``. If schema is not provided, it will be
  1152. generated according to dtypes of DataFrame columns. See
  1153. BigQuery API documentation on available names of a field.
  1154. *New in version 0.3.1 of pandas-gbq*.
  1155. location : str, optional
  1156. Location where the load job should run. See the `BigQuery locations
  1157. documentation
  1158. <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
  1159. list of available locations. The location must match that of the
  1160. target dataset.
  1161. *New in version 0.5.0 of pandas-gbq*.
  1162. progress_bar : bool, default True
  1163. Use the library `tqdm` to show the progress bar for the upload,
  1164. chunk by chunk.
  1165. *New in version 0.5.0 of pandas-gbq*.
  1166. credentials : google.auth.credentials.Credentials, optional
  1167. Credentials for accessing Google APIs. Use this parameter to
  1168. override default credentials, such as to use Compute Engine
  1169. :class:`google.auth.compute_engine.Credentials` or Service
  1170. Account :class:`google.oauth2.service_account.Credentials`
  1171. directly.
  1172. *New in version 0.8.0 of pandas-gbq*.
  1173. .. versionadded:: 0.24.0
  1174. verbose : bool, deprecated
  1175. Deprecated in pandas-gbq version 0.4.0. Use the `logging module
  1176. to adjust verbosity instead
  1177. <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
  1178. private_key : str, deprecated
  1179. Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
  1180. parameter and
  1181. :func:`google.oauth2.service_account.Credentials.from_service_account_info`
  1182. or
  1183. :func:`google.oauth2.service_account.Credentials.from_service_account_file`
  1184. instead.
  1185. Service account private key in JSON format. Can be file path
  1186. or string contents. This is useful for remote server
  1187. authentication (eg. Jupyter/IPython notebook on remote host).
  1188. See Also
  1189. --------
  1190. pandas_gbq.to_gbq : This function in the pandas-gbq library.
  1191. read_gbq : Read a DataFrame from Google BigQuery.
  1192. """
  1193. from pandas.io import gbq
  1194. return gbq.to_gbq(
  1195. self, destination_table, project_id=project_id,
  1196. chunksize=chunksize, reauth=reauth, if_exists=if_exists,
  1197. auth_local_webserver=auth_local_webserver,
  1198. table_schema=table_schema, location=location,
  1199. progress_bar=progress_bar, credentials=credentials,
  1200. verbose=verbose, private_key=private_key)
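# Illustrative sketch (not part of the original source); the dataset, table
# and project names below are hypothetical:
#
#   >>> df.to_gbq('my_dataset.my_table', project_id='my-project',
#   ...           if_exists='append')  # doctest: +SKIP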
  1201. @classmethod
  1202. def from_records(cls, data, index=None, exclude=None, columns=None,
  1203. coerce_float=False, nrows=None):
  1204. """
  1205. Convert structured or record ndarray to DataFrame.
  1206. Parameters
  1207. ----------
  1208. data : ndarray (structured dtype), list of tuples, dict, or DataFrame
  1209. index : string, list of fields, array-like
  1210. Field of array to use as the index, alternatively a specific set of
  1211. input labels to use
  1212. exclude : sequence, default None
  1213. Columns or fields to exclude
  1214. columns : sequence, default None
  1215. Column names to use. If the passed data do not have names
  1216. associated with them, this argument provides names for the
  1217. columns. Otherwise this argument indicates the order of the columns
  1218. in the result (any names not found in the data will become all-NA
  1219. columns)
  1220. coerce_float : boolean, default False
  1221. Attempt to convert values of non-string, non-numeric objects (like
  1222. decimal.Decimal) to floating point, useful for SQL result sets
  1223. nrows : int, default None
  1224. Number of rows to read if data is an iterator
  1225. Returns
  1226. -------
  1227. DataFrame
  1228. """
  1229. # Make a copy of the input columns so we can modify it
  1230. if columns is not None:
  1231. columns = ensure_index(columns)
  1232. if is_iterator(data):
  1233. if nrows == 0:
  1234. return cls()
  1235. try:
  1236. first_row = next(data)
  1237. except StopIteration:
  1238. return cls(index=index, columns=columns)
  1239. dtype = None
  1240. if hasattr(first_row, 'dtype') and first_row.dtype.names:
  1241. dtype = first_row.dtype
  1242. values = [first_row]
  1243. if nrows is None:
  1244. values += data
  1245. else:
  1246. values.extend(itertools.islice(data, nrows - 1))
  1247. if dtype is not None:
  1248. data = np.array(values, dtype=dtype)
  1249. else:
  1250. data = values
  1251. if isinstance(data, dict):
  1252. if columns is None:
  1253. columns = arr_columns = ensure_index(sorted(data))
  1254. arrays = [data[k] for k in columns]
  1255. else:
  1256. arrays = []
  1257. arr_columns = []
  1258. for k, v in compat.iteritems(data):
  1259. if k in columns:
  1260. arr_columns.append(k)
  1261. arrays.append(v)
  1262. arrays, arr_columns = reorder_arrays(arrays, arr_columns,
  1263. columns)
  1264. elif isinstance(data, (np.ndarray, DataFrame)):
  1265. arrays, columns = to_arrays(data, columns)
  1266. if columns is not None:
  1267. columns = ensure_index(columns)
  1268. arr_columns = columns
  1269. else:
  1270. arrays, arr_columns = to_arrays(data, columns,
  1271. coerce_float=coerce_float)
  1272. arr_columns = ensure_index(arr_columns)
  1273. if columns is not None:
  1274. columns = ensure_index(columns)
  1275. else:
  1276. columns = arr_columns
  1277. if exclude is None:
  1278. exclude = set()
  1279. else:
  1280. exclude = set(exclude)
  1281. result_index = None
  1282. if index is not None:
  1283. if (isinstance(index, compat.string_types) or
  1284. not hasattr(index, "__iter__")):
  1285. i = columns.get_loc(index)
  1286. exclude.add(index)
  1287. if len(arrays) > 0:
  1288. result_index = Index(arrays[i], name=index)
  1289. else:
  1290. result_index = Index([], name=index)
  1291. else:
  1292. try:
  1293. index_data = [arrays[arr_columns.get_loc(field)]
  1294. for field in index]
  1295. result_index = ensure_index_from_sequences(index_data,
  1296. names=index)
  1297. exclude.update(index)
  1298. except Exception:
  1299. result_index = index
  1300. if any(exclude):
  1301. arr_exclude = [x for x in exclude if x in arr_columns]
  1302. to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
  1303. arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
  1304. arr_columns = arr_columns.drop(arr_exclude)
  1305. columns = columns.drop(exclude)
  1306. mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
  1307. return cls(mgr)
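# Illustrative sketch (not part of the original source): a minimal
# from_records call with a list of tuples, e.g.
#
#   >>> pd.DataFrame.from_records([(1, 2.0), (3, 4.0)],
#   ...                           columns=['x', 'y'])
#      x    y
#   0  1  2.0
#   1  3  4.0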
  1308. def to_records(self, index=True, convert_datetime64=None,
  1309. column_dtypes=None, index_dtypes=None):
  1310. """
  1311. Convert DataFrame to a NumPy record array.
  1312. Index will be included as the first field of the record array if
  1313. requested.
  1314. Parameters
  1315. ----------
  1316. index : bool, default True
  1317. Include index in resulting record array, stored in 'index'
  1318. field or using the index label, if set.
  1319. convert_datetime64 : bool, default None
  1320. .. deprecated:: 0.23.0
  1321. Whether to convert the index to datetime.datetime if it is a
  1322. DatetimeIndex.
  1323. column_dtypes : str, type, dict, default None
  1324. .. versionadded:: 0.24.0
  1325. If a string or type, the data type to store all columns. If
  1326. a dictionary, a mapping of column names and indices (zero-indexed)
  1327. to specific data types.
  1328. index_dtypes : str, type, dict, default None
  1329. .. versionadded:: 0.24.0
  1330. If a string or type, the data type to store all index levels. If
  1331. a dictionary, a mapping of index level names and indices
  1332. (zero-indexed) to specific data types.
  1333. This mapping is applied only if `index=True`.
  1334. Returns
  1335. -------
  1336. numpy.recarray
  1337. NumPy ndarray with the DataFrame labels as fields and each row
  1338. of the DataFrame as entries.
  1339. See Also
  1340. --------
  1341. DataFrame.from_records: Convert structured or record ndarray
  1342. to DataFrame.
  1343. numpy.recarray: An ndarray that allows field access using
  1344. attributes, analogous to typed columns in a
  1345. spreadsheet.
  1346. Examples
  1347. --------
  1348. >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
  1349. ... index=['a', 'b'])
  1350. >>> df
  1351. A B
  1352. a 1 0.50
  1353. b 2 0.75
  1354. >>> df.to_records()
  1355. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  1356. dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
  1357. If the DataFrame index has no label then the recarray field name
  1358. is set to 'index'. If the index has a label then this is used as the
  1359. field name:
  1360. >>> df.index = df.index.rename("I")
  1361. >>> df.to_records()
  1362. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  1363. dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
  1364. The index can be excluded from the record array:
  1365. >>> df.to_records(index=False)
  1366. rec.array([(1, 0.5 ), (2, 0.75)],
  1367. dtype=[('A', '<i8'), ('B', '<f8')])
  1368. Data types can be specified for the columns:
  1369. >>> df.to_records(column_dtypes={"A": "int32"})
  1370. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  1371. dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
  1372. As well as for the index:
  1373. >>> df.to_records(index_dtypes="<S2")
  1374. rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
  1375. dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
  1376. >>> index_dtypes = "<S{}".format(df.index.str.len().max())
  1377. >>> df.to_records(index_dtypes=index_dtypes)
  1378. rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
  1379. dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
  1380. """
  1381. if convert_datetime64 is not None:
  1382. warnings.warn("The 'convert_datetime64' parameter is "
  1383. "deprecated and will be removed in a future "
  1384. "version",
  1385. FutureWarning, stacklevel=2)
  1386. if index:
  1387. if is_datetime64_any_dtype(self.index) and convert_datetime64:
  1388. ix_vals = [self.index.to_pydatetime()]
  1389. else:
  1390. if isinstance(self.index, MultiIndex):
  1391. # array of tuples to numpy cols. copy copy copy
  1392. ix_vals = lmap(np.array, zip(*self.index.values))
  1393. else:
  1394. ix_vals = [self.index.values]
  1395. arrays = ix_vals + [self[c].get_values() for c in self.columns]
  1396. count = 0
  1397. index_names = list(self.index.names)
  1398. if isinstance(self.index, MultiIndex):
  1399. for i, n in enumerate(index_names):
  1400. if n is None:
  1401. index_names[i] = 'level_%d' % count
  1402. count += 1
  1403. elif index_names[0] is None:
  1404. index_names = ['index']
  1405. names = (lmap(compat.text_type, index_names) +
  1406. lmap(compat.text_type, self.columns))
  1407. else:
  1408. arrays = [self[c].get_values() for c in self.columns]
  1409. names = lmap(compat.text_type, self.columns)
  1410. index_names = []
  1411. index_len = len(index_names)
  1412. formats = []
  1413. for i, v in enumerate(arrays):
  1414. index = i
  1415. # When the names and arrays are collected, we
  1416. # first collect those in the DataFrame's index,
  1417. # followed by those in its columns.
  1418. #
  1419. # Thus, the total length of the array is:
  1420. # len(index_names) + len(DataFrame.columns).
  1421. #
  1422. # This check allows us to see whether we are
  1423. # handling a name / array in the index or column.
  1424. if index < index_len:
  1425. dtype_mapping = index_dtypes
  1426. name = index_names[index]
  1427. else:
  1428. index -= index_len
  1429. dtype_mapping = column_dtypes
  1430. name = self.columns[index]
1431. # If we have a dictionary, we get the data type
1432. # associated with the index or column (which can
1433. # be denoted by its name in the DataFrame or its
1434. # position in the DataFrame's array of indices or
1435. # columns, whichever is applicable).
  1436. if is_dict_like(dtype_mapping):
  1437. if name in dtype_mapping:
  1438. dtype_mapping = dtype_mapping[name]
  1439. elif index in dtype_mapping:
  1440. dtype_mapping = dtype_mapping[index]
  1441. else:
  1442. dtype_mapping = None
  1443. # If no mapping can be found, use the array's
  1444. # dtype attribute for formatting.
  1445. #
  1446. # A valid dtype must either be a type or
  1447. # string naming a type.
  1448. if dtype_mapping is None:
  1449. formats.append(v.dtype)
  1450. elif isinstance(dtype_mapping, (type, np.dtype,
  1451. compat.string_types)):
  1452. formats.append(dtype_mapping)
  1453. else:
  1454. element = "row" if i < index_len else "column"
  1455. msg = ("Invalid dtype {dtype} specified for "
  1456. "{element} {name}").format(dtype=dtype_mapping,
  1457. element=element, name=name)
  1458. raise ValueError(msg)
  1459. return np.rec.fromarrays(
  1460. arrays,
  1461. dtype={'names': names, 'formats': formats}
  1462. )
  1463. @classmethod
  1464. def from_items(cls, items, columns=None, orient='columns'):
  1465. """
  1466. Construct a DataFrame from a list of tuples.
  1467. .. deprecated:: 0.23.0
  1468. `from_items` is deprecated and will be removed in a future version.
  1469. Use :meth:`DataFrame.from_dict(dict(items)) <DataFrame.from_dict>`
  1470. instead.
  1471. :meth:`DataFrame.from_dict(OrderedDict(items)) <DataFrame.from_dict>`
  1472. may be used to preserve the key order.
  1473. Convert (key, value) pairs to DataFrame. The keys will be the axis
  1474. index (usually the columns, but depends on the specified
  1475. orientation). The values should be arrays or Series.
  1476. Parameters
  1477. ----------
  1478. items : sequence of (key, value) pairs
  1479. Values should be arrays or Series.
  1480. columns : sequence of column labels, optional
  1481. Must be passed if orient='index'.
  1482. orient : {'columns', 'index'}, default 'columns'
  1483. The "orientation" of the data. If the keys of the
  1484. input correspond to column labels, pass 'columns'
  1485. (default). Otherwise if the keys correspond to the index,
  1486. pass 'index'.
  1487. Returns
  1488. -------
  1489. DataFrame
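Examples
--------
A minimal sketch of the recommended replacement (the items below are
illustrative only):
>>> from collections import OrderedDict
>>> items = [('A', [1, 2, 3]), ('B', [4, 5, 6])]
>>> pd.DataFrame.from_dict(OrderedDict(items))
   A  B
0  1  4
1  2  5
2  3  6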
  1490. """
  1491. warnings.warn("from_items is deprecated. Please use "
  1492. "DataFrame.from_dict(dict(items), ...) instead. "
  1493. "DataFrame.from_dict(OrderedDict(items)) may be used to "
  1494. "preserve the key order.",
  1495. FutureWarning, stacklevel=2)
  1496. keys, values = lzip(*items)
  1497. if orient == 'columns':
  1498. if columns is not None:
  1499. columns = ensure_index(columns)
  1500. idict = dict(items)
  1501. if len(idict) < len(items):
  1502. if not columns.equals(ensure_index(keys)):
  1503. raise ValueError('With non-unique item names, passed '
  1504. 'columns must be identical')
  1505. arrays = values
  1506. else:
  1507. arrays = [idict[k] for k in columns if k in idict]
  1508. else:
  1509. columns = ensure_index(keys)
  1510. arrays = values
  1511. # GH 17312
  1512. # Provide more informative error msg when scalar values passed
  1513. try:
  1514. return cls._from_arrays(arrays, columns, None)
  1515. except ValueError:
  1516. if not is_nested_list_like(values):
  1517. raise ValueError('The value in each (key, value) pair '
  1518. 'must be an array, Series, or dict')
  1519. elif orient == 'index':
  1520. if columns is None:
  1521. raise TypeError("Must pass columns with orient='index'")
  1522. keys = ensure_index(keys)
  1523. # GH 17312
  1524. # Provide more informative error msg when scalar values passed
  1525. try:
  1526. arr = np.array(values, dtype=object).T
  1527. data = [lib.maybe_convert_objects(v) for v in arr]
  1528. return cls._from_arrays(data, columns, keys)
  1529. except TypeError:
  1530. if not is_nested_list_like(values):
  1531. raise ValueError('The value in each (key, value) pair '
  1532. 'must be an array, Series, or dict')
  1533. else: # pragma: no cover
  1534. raise ValueError("'orient' must be either 'columns' or 'index'")
  1535. @classmethod
  1536. def _from_arrays(cls, arrays, columns, index, dtype=None):
  1537. mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
  1538. return cls(mgr)
  1539. @classmethod
  1540. def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True,
  1541. encoding=None, tupleize_cols=None,
  1542. infer_datetime_format=False):
  1543. """
  1544. Read CSV file.
  1545. .. deprecated:: 0.21.0
  1546. Use :func:`read_csv` instead.
  1547. It is preferable to use the more powerful :func:`read_csv`
  1548. for most general purposes, but ``from_csv`` makes for an easy
  1549. roundtrip to and from a file (the exact counterpart of
  1550. ``to_csv``), especially with a DataFrame of time series data.
  1551. This method only differs from the preferred :func:`read_csv`
  1552. in some defaults:
  1553. - `index_col` is ``0`` instead of ``None`` (take first column as index
  1554. by default)
  1555. - `parse_dates` is ``True`` instead of ``False`` (try parsing the index
  1556. as datetime by default)
  1557. So a ``pd.DataFrame.from_csv(path)`` can be replaced by
  1558. ``pd.read_csv(path, index_col=0, parse_dates=True)``.
  1559. Parameters
  1560. ----------
  1561. path : string file path or file handle / StringIO
  1562. header : int, default 0
  1563. Row to use as header (skip prior rows)
  1564. sep : string, default ','
  1565. Field delimiter
  1566. index_col : int or sequence, default 0
  1567. Column to use for index. If a sequence is given, a MultiIndex
  1568. is used. Different default from read_table
  1569. parse_dates : boolean, default True
  1570. Parse dates. Different default from read_table
  1571. tupleize_cols : boolean, default False
1572. If True, write MultiIndex columns as a list of tuples; if False,
1573. write them in the new (expanded) format.
  1574. infer_datetime_format : boolean, default False
  1575. If True and `parse_dates` is True for a column, try to infer the
  1576. datetime format based on the first datetime string. If the format
  1577. can be inferred, there often will be a large parsing speed-up.
  1578. Returns
  1579. -------
  1580. DataFrame
  1581. See Also
  1582. --------
  1583. read_csv
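Examples
--------
A sketch of the drop-in replacement suggested above (the file path is
hypothetical):
>>> df = pd.DataFrame.from_csv('data.csv')  # doctest: +SKIP
>>> df = pd.read_csv('data.csv', index_col=0,
...                  parse_dates=True)  # doctest: +SKIP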
  1584. """
  1585. warnings.warn("from_csv is deprecated. Please use read_csv(...) "
  1586. "instead. Note that some of the default arguments are "
  1587. "different, so please refer to the documentation "
  1588. "for from_csv when changing your function calls",
  1589. FutureWarning, stacklevel=2)
  1590. from pandas.io.parsers import read_csv
  1591. return read_csv(path, header=header, sep=sep,
  1592. parse_dates=parse_dates, index_col=index_col,
  1593. encoding=encoding, tupleize_cols=tupleize_cols,
  1594. infer_datetime_format=infer_datetime_format)
  1595. def to_sparse(self, fill_value=None, kind='block'):
  1596. """
  1597. Convert to SparseDataFrame.
1598. Implement the sparse version of the DataFrame, meaning that any data
1599. matching a specific value is omitted in the representation.
1600. The sparse DataFrame allows for more efficient storage.
  1601. Parameters
  1602. ----------
  1603. fill_value : float, default None
  1604. The specific value that should be omitted in the representation.
  1605. kind : {'block', 'integer'}, default 'block'
  1606. The kind of the SparseIndex tracking where data is not equal to
  1607. the fill value:
  1608. - 'block' tracks only the locations and sizes of blocks of data.
  1609. - 'integer' keeps an array with all the locations of the data.
  1610. In most cases 'block' is recommended, since it's more memory
  1611. efficient.
  1612. Returns
  1613. -------
  1614. SparseDataFrame
  1615. The sparse representation of the DataFrame.
  1616. See Also
  1617. --------
  1618. DataFrame.to_dense :
1619. Converts the DataFrame back to its dense form.
  1620. Examples
  1621. --------
  1622. >>> df = pd.DataFrame([(np.nan, np.nan),
  1623. ... (1., np.nan),
  1624. ... (np.nan, 1.)])
  1625. >>> df
  1626. 0 1
  1627. 0 NaN NaN
  1628. 1 1.0 NaN
  1629. 2 NaN 1.0
  1630. >>> type(df)
  1631. <class 'pandas.core.frame.DataFrame'>
  1632. >>> sdf = df.to_sparse()
  1633. >>> sdf
  1634. 0 1
  1635. 0 NaN NaN
  1636. 1 1.0 NaN
  1637. 2 NaN 1.0
  1638. >>> type(sdf)
  1639. <class 'pandas.core.sparse.frame.SparseDataFrame'>
  1640. """
  1641. from pandas.core.sparse.api import SparseDataFrame
  1642. return SparseDataFrame(self._series, index=self.index,
  1643. columns=self.columns, default_kind=kind,
  1644. default_fill_value=fill_value)
  1645. def to_panel(self):
  1646. """
  1647. Transform long (stacked) format (DataFrame) into wide (3D, Panel)
  1648. format.
  1649. .. deprecated:: 0.20.0
  1650. Currently the index of the DataFrame must be a 2-level MultiIndex. This
1651. may be generalized later.
  1652. Returns
  1653. -------
  1654. Panel
  1655. """
  1656. raise NotImplementedError("Panel is being removed in pandas 0.25.0.")
  1657. @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
  1658. def to_stata(self, fname, convert_dates=None, write_index=True,
  1659. encoding="latin-1", byteorder=None, time_stamp=None,
  1660. data_label=None, variable_labels=None, version=114,
  1661. convert_strl=None):
  1662. """
  1663. Export DataFrame object to Stata dta format.
  1664. Writes the DataFrame to a Stata dataset file.
  1665. "dta" files contain a Stata dataset.
  1666. Parameters
  1667. ----------
  1668. fname : str, buffer or path object
  1669. String, path object (pathlib.Path or py._path.local.LocalPath) or
  1670. object implementing a binary write() function. If using a buffer
  1671. then the buffer will not be automatically closed after the file
  1672. data has been written.
  1673. convert_dates : dict
1674. Dictionary mapping columns containing datetime types to Stata
  1675. internal format to use when writing the dates. Options are 'tc',
  1676. 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
  1677. or a name. Datetime columns that do not have a conversion type
  1678. specified will be converted to 'tc'. Raises NotImplementedError if
  1679. a datetime column has timezone information.
  1680. write_index : bool
  1681. Write the index to Stata dataset.
  1682. encoding : str
  1683. Default is latin-1. Unicode is not supported.
  1684. byteorder : str
1685. Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
  1686. time_stamp : datetime
  1687. A datetime to use as file creation date. Default is the current
  1688. time.
  1689. data_label : str, optional
  1690. A label for the data set. Must be 80 characters or smaller.
  1691. variable_labels : dict
  1692. Dictionary containing columns as keys and variable labels as
  1693. values. Each label must be 80 characters or smaller.
  1694. .. versionadded:: 0.19.0
  1695. version : {114, 117}, default 114
1696. Version to use in the output dta file. Version 114 can be
  1697. read by Stata 10 and later. Version 117 can be read by Stata 13
  1698. or later. Version 114 limits string variables to 244 characters or
  1699. fewer while 117 allows strings with lengths up to 2,000,000
  1700. characters.
  1701. .. versionadded:: 0.23.0
  1702. convert_strl : list, optional
1703. List of column names to convert to the Stata StrL
  1704. format. Only available if version is 117. Storing strings in the
  1705. StrL format can produce smaller dta files if strings have more than
  1706. 8 characters and values are repeated.
  1707. .. versionadded:: 0.23.0
  1708. Raises
  1709. ------
  1710. NotImplementedError
  1711. * If datetimes contain timezone information
  1712. * Column dtype is not representable in Stata
  1713. ValueError
  1714. * Columns listed in convert_dates are neither datetime64[ns]
1715. nor datetime.datetime
1716. * A column listed in convert_dates is not in the DataFrame
  1717. * Categorical label contains more than 32,000 characters
  1718. .. versionadded:: 0.19.0
  1719. See Also
  1720. --------
  1721. read_stata : Import Stata data files.
  1722. io.stata.StataWriter : Low-level writer for Stata data files.
  1723. io.stata.StataWriter117 : Low-level writer for version 117 files.
  1724. Examples
  1725. --------
  1726. >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
  1727. ... 'parrot'],
  1728. ... 'speed': [350, 18, 361, 15]})
  1729. >>> df.to_stata('animals.dta') # doctest: +SKIP
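Dates can be written in a specific Stata format per column (a sketch;
the 'date' column is hypothetical):
>>> df_dates = pd.DataFrame({'date': pd.date_range('2000-01-01',
...                                                periods=2)})
>>> df_dates.to_stata('dates.dta',
...                   convert_dates={'date': 'td'})  # doctest: +SKIP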
  1730. """
  1731. kwargs = {}
  1732. if version not in (114, 117):
  1733. raise ValueError('Only formats 114 and 117 supported.')
  1734. if version == 114:
  1735. if convert_strl is not None:
  1736. raise ValueError('strl support is only available when using '
  1737. 'format 117')
  1738. from pandas.io.stata import StataWriter as statawriter
  1739. else:
  1740. from pandas.io.stata import StataWriter117 as statawriter
  1741. kwargs['convert_strl'] = convert_strl
  1742. writer = statawriter(fname, self, convert_dates=convert_dates,
  1743. byteorder=byteorder, time_stamp=time_stamp,
  1744. data_label=data_label, write_index=write_index,
  1745. variable_labels=variable_labels, **kwargs)
  1746. writer.write_file()
  1747. def to_feather(self, fname):
  1748. """
  1749. Write out the binary feather-format for DataFrames.
  1750. .. versionadded:: 0.20.0
  1751. Parameters
  1752. ----------
  1753. fname : str
  1754. string file path
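Examples
--------
A minimal sketch (the file name is illustrative; writing feather files
requires the optional ``pyarrow`` dependency):
>>> df = pd.DataFrame({'a': [1, 2, 3]})
>>> df.to_feather('df.feather')  # doctest: +SKIP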
  1755. """
  1756. from pandas.io.feather_format import to_feather
  1757. to_feather(self, fname)
  1758. def to_parquet(self, fname, engine='auto', compression='snappy',
  1759. index=None, partition_cols=None, **kwargs):
  1760. """
  1761. Write a DataFrame to the binary parquet format.
  1762. .. versionadded:: 0.21.0
  1763. This function writes the dataframe as a `parquet file
  1764. <https://parquet.apache.org/>`_. You can choose different parquet
  1765. backends, and have the option of compression. See
  1766. :ref:`the user guide <io.parquet>` for more details.
  1767. Parameters
  1768. ----------
  1769. fname : str
  1770. File path or Root Directory path. Will be used as Root Directory
  1771. path while writing a partitioned dataset.
  1772. .. versionchanged:: 0.24.0
  1773. engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
  1774. Parquet library to use. If 'auto', then the option
  1775. ``io.parquet.engine`` is used. The default ``io.parquet.engine``
  1776. behavior is to try 'pyarrow', falling back to 'fastparquet' if
  1777. 'pyarrow' is unavailable.
  1778. compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
  1779. Name of the compression to use. Use ``None`` for no compression.
  1780. index : bool, default None
  1781. If ``True``, include the dataframe's index(es) in the file output.
  1782. If ``False``, they will not be written to the file. If ``None``,
  1783. the behavior depends on the chosen engine.
  1784. .. versionadded:: 0.24.0
  1785. partition_cols : list, optional, default None
1786. Column names by which to partition the dataset.
1787. Columns are partitioned in the order they are given.
  1788. .. versionadded:: 0.24.0
  1789. **kwargs
  1790. Additional arguments passed to the parquet library. See
  1791. :ref:`pandas io <io.parquet>` for more details.
  1792. See Also
  1793. --------
  1794. read_parquet : Read a parquet file.
  1795. DataFrame.to_csv : Write a csv file.
  1796. DataFrame.to_sql : Write to a sql table.
  1797. DataFrame.to_hdf : Write to hdf.
  1798. Notes
  1799. -----
  1800. This function requires either the `fastparquet
  1801. <https://pypi.org/project/fastparquet>`_ or `pyarrow
  1802. <https://arrow.apache.org/docs/python/>`_ library.
  1803. Examples
  1804. --------
  1805. >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
  1806. >>> df.to_parquet('df.parquet.gzip',
  1807. ... compression='gzip') # doctest: +SKIP
  1808. >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
  1809. col1 col2
  1810. 0 1 3
  1811. 1 2 4
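When ``partition_cols`` is given, the output is a partitioned dataset
rooted at `fname` (a sketch; the directory name is illustrative):
>>> df.to_parquet('df_dir',
...               partition_cols=['col1'])  # doctest: +SKIP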
  1812. """
  1813. from pandas.io.parquet import to_parquet
  1814. to_parquet(self, fname, engine,
  1815. compression=compression, index=index,
  1816. partition_cols=partition_cols, **kwargs)
  1817. @Substitution(header='Whether to print column labels, default True')
  1818. @Substitution(shared_params=fmt.common_docstring,
  1819. returns=fmt.return_docstring)
  1820. def to_html(self, buf=None, columns=None, col_space=None, header=True,
  1821. index=True, na_rep='NaN', formatters=None, float_format=None,
  1822. sparsify=None, index_names=True, justify=None, max_rows=None,
  1823. max_cols=None, show_dimensions=False, decimal='.',
  1824. bold_rows=True, classes=None, escape=True, notebook=False,
  1825. border=None, table_id=None, render_links=False):
  1826. """
  1827. Render a DataFrame as an HTML table.
  1828. %(shared_params)s
  1829. bold_rows : bool, default True
  1830. Make the row labels bold in the output.
  1831. classes : str or list or tuple, default None
  1832. CSS class(es) to apply to the resulting html table.
  1833. escape : bool, default True
  1834. Convert the characters <, >, and & to HTML-safe sequences.
  1835. notebook : {True, False}, default False
  1836. Whether the generated HTML is for IPython Notebook.
  1837. border : int
  1838. A ``border=border`` attribute is included in the opening
  1839. `<table>` tag. Default ``pd.options.html.border``.
  1840. .. versionadded:: 0.19.0
  1841. table_id : str, optional
  1842. A css id is included in the opening `<table>` tag if specified.
  1843. .. versionadded:: 0.23.0
  1844. render_links : bool, default False
  1845. Convert URLs to HTML links.
  1846. .. versionadded:: 0.24.0
  1847. %(returns)s
  1848. See Also
  1849. --------
  1850. to_string : Convert DataFrame to a string.
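Examples
--------
A minimal sketch; the rendered markup is long, so the output is
skipped here:
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> html = df.to_html()  # doctest: +SKIP
>>> html.startswith('<table')  # doctest: +SKIP
True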
  1851. """
  1852. if (justify is not None and
  1853. justify not in fmt._VALID_JUSTIFY_PARAMETERS):
  1854. raise ValueError("Invalid value for justify parameter")
  1855. formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
  1856. col_space=col_space, na_rep=na_rep,
  1857. formatters=formatters,
  1858. float_format=float_format,
  1859. sparsify=sparsify, justify=justify,
  1860. index_names=index_names,
  1861. header=header, index=index,
  1862. bold_rows=bold_rows, escape=escape,
  1863. max_rows=max_rows,
  1864. max_cols=max_cols,
  1865. show_dimensions=show_dimensions,
  1866. decimal=decimal, table_id=table_id,
  1867. render_links=render_links)
1868. # TODO: a generic formatter would be in DataFrameFormatter
  1869. formatter.to_html(classes=classes, notebook=notebook, border=border)
  1870. if buf is None:
  1871. return formatter.buf.getvalue()
  1872. # ----------------------------------------------------------------------
  1873. def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
  1874. null_counts=None):
  1875. """
  1876. Print a concise summary of a DataFrame.
  1877. This method prints information about a DataFrame including
  1878. the index dtype and column dtypes, non-null values and memory usage.
  1879. Parameters
  1880. ----------
  1881. verbose : bool, optional
  1882. Whether to print the full summary. By default, the setting in
  1883. ``pandas.options.display.max_info_columns`` is followed.
  1884. buf : writable buffer, defaults to sys.stdout
  1885. Where to send the output. By default, the output is printed to
  1886. sys.stdout. Pass a writable buffer if you need to further process
  1887. the output.
  1888. max_cols : int, optional
  1889. When to switch from the verbose to the truncated output. If the
  1890. DataFrame has more than `max_cols` columns, the truncated output
  1891. is used. By default, the setting in
  1892. ``pandas.options.display.max_info_columns`` is used.
  1893. memory_usage : bool, str, optional
  1894. Specifies whether total memory usage of the DataFrame
  1895. elements (including the index) should be displayed. By default,
  1896. this follows the ``pandas.options.display.memory_usage`` setting.
1897. True always shows memory usage. False never shows memory usage.
  1898. A value of 'deep' is equivalent to "True with deep introspection".
  1899. Memory usage is shown in human-readable units (base-2
  1900. representation). Without deep introspection a memory estimation is
1901. made based on column dtype and number of rows, assuming values
  1902. consume the same memory amount for corresponding dtypes. With deep
  1903. memory introspection, a real memory usage calculation is performed
  1904. at the cost of computational resources.
  1905. null_counts : bool, optional
  1906. Whether to show the non-null counts. By default, this is shown
  1907. only if the frame is smaller than
  1908. ``pandas.options.display.max_info_rows`` and
  1909. ``pandas.options.display.max_info_columns``. A value of True always
  1910. shows the counts, and False never shows the counts.
  1911. Returns
  1912. -------
  1913. None
  1914. This method prints a summary of a DataFrame and returns None.
  1915. See Also
  1916. --------
  1917. DataFrame.describe: Generate descriptive statistics of DataFrame
  1918. columns.
  1919. DataFrame.memory_usage: Memory usage of DataFrame columns.
  1920. Examples
  1921. --------
  1922. >>> int_values = [1, 2, 3, 4, 5]
  1923. >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
  1924. >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
  1925. >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
  1926. ... "float_col": float_values})
  1927. >>> df
  1928. int_col text_col float_col
  1929. 0 1 alpha 0.00
  1930. 1 2 beta 0.25
  1931. 2 3 gamma 0.50
  1932. 3 4 delta 0.75
  1933. 4 5 epsilon 1.00
  1934. Prints information of all columns:
  1935. >>> df.info(verbose=True)
  1936. <class 'pandas.core.frame.DataFrame'>
  1937. RangeIndex: 5 entries, 0 to 4
  1938. Data columns (total 3 columns):
  1939. int_col 5 non-null int64
  1940. text_col 5 non-null object
  1941. float_col 5 non-null float64
  1942. dtypes: float64(1), int64(1), object(1)
  1943. memory usage: 200.0+ bytes
1944. Prints a summary of the column count and dtypes but not per-column
  1945. information:
  1946. >>> df.info(verbose=False)
  1947. <class 'pandas.core.frame.DataFrame'>
  1948. RangeIndex: 5 entries, 0 to 4
  1949. Columns: 3 entries, int_col to float_col
  1950. dtypes: float64(1), int64(1), object(1)
  1951. memory usage: 200.0+ bytes
1952. Pipe the output of DataFrame.info to a buffer instead of sys.stdout,
1953. get the buffer content and write it to a text file:
  1954. >>> import io
  1955. >>> buffer = io.StringIO()
  1956. >>> df.info(buf=buffer)
  1957. >>> s = buffer.getvalue()
  1958. >>> with open("df_info.txt", "w",
  1959. ... encoding="utf-8") as f: # doctest: +SKIP
  1960. ... f.write(s)
  1961. 260
1962. The `memory_usage` parameter allows deep introspection mode, especially
1963. useful for big DataFrames and for fine-tuning memory optimization:
  1964. >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
  1965. >>> df = pd.DataFrame({
  1966. ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
  1967. ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
  1968. ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
  1969. ... })
  1970. >>> df.info()
  1971. <class 'pandas.core.frame.DataFrame'>
  1972. RangeIndex: 1000000 entries, 0 to 999999
  1973. Data columns (total 3 columns):
  1974. column_1 1000000 non-null object
  1975. column_2 1000000 non-null object
  1976. column_3 1000000 non-null object
  1977. dtypes: object(3)
  1978. memory usage: 22.9+ MB
  1979. >>> df.info(memory_usage='deep')
  1980. <class 'pandas.core.frame.DataFrame'>
  1981. RangeIndex: 1000000 entries, 0 to 999999
  1982. Data columns (total 3 columns):
  1983. column_1 1000000 non-null object
  1984. column_2 1000000 non-null object
  1985. column_3 1000000 non-null object
  1986. dtypes: object(3)
  1987. memory usage: 188.8 MB
  1988. """
  1989. if buf is None: # pragma: no cover
  1990. buf = sys.stdout
  1991. lines = []
  1992. lines.append(str(type(self)))
  1993. lines.append(self.index._summary())
  1994. if len(self.columns) == 0:
  1995. lines.append('Empty {name}'.format(name=type(self).__name__))
  1996. fmt.buffer_put_lines(buf, lines)
  1997. return
  1998. cols = self.columns
  1999. # hack
  2000. if max_cols is None:
  2001. max_cols = get_option('display.max_info_columns',
  2002. len(self.columns) + 1)
  2003. max_rows = get_option('display.max_info_rows', len(self) + 1)
  2004. if null_counts is None:
  2005. show_counts = ((len(self.columns) <= max_cols) and
  2006. (len(self) < max_rows))
  2007. else:
  2008. show_counts = null_counts
  2009. exceeds_info_cols = len(self.columns) > max_cols
  2010. def _verbose_repr():
  2011. lines.append('Data columns (total %d columns):' %
  2012. len(self.columns))
  2013. space = max(len(pprint_thing(k)) for k in self.columns) + 4
  2014. counts = None
  2015. tmpl = "{count}{dtype}"
  2016. if show_counts:
  2017. counts = self.count()
  2018. if len(cols) != len(counts): # pragma: no cover
  2019. raise AssertionError(
  2020. 'Columns must equal counts '
  2021. '({cols:d} != {counts:d})'.format(
  2022. cols=len(cols), counts=len(counts)))
  2023. tmpl = "{count} non-null {dtype}"
  2024. dtypes = self.dtypes
  2025. for i, col in enumerate(self.columns):
  2026. dtype = dtypes.iloc[i]
  2027. col = pprint_thing(col)
  2028. count = ""
  2029. if show_counts:
  2030. count = counts.iloc[i]
  2031. lines.append(_put_str(col, space) + tmpl.format(count=count,
  2032. dtype=dtype))
  2033. def _non_verbose_repr():
  2034. lines.append(self.columns._summary(name='Columns'))
  2035. def _sizeof_fmt(num, size_qualifier):
  2036. # returns size in human readable format
  2037. for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
  2038. if num < 1024.0:
  2039. return ("{num:3.1f}{size_q} "
  2040. "{x}".format(num=num, size_q=size_qualifier, x=x))
  2041. num /= 1024.0
  2042. return "{num:3.1f}{size_q} {pb}".format(num=num,
  2043. size_q=size_qualifier,
  2044. pb='PB')
  2045. if verbose:
  2046. _verbose_repr()
2047. elif verbose is False: # explicitly set to False, as opposed to None
  2048. _non_verbose_repr()
  2049. else:
  2050. if exceeds_info_cols:
  2051. _non_verbose_repr()
  2052. else:
  2053. _verbose_repr()
  2054. counts = self.get_dtype_counts()
  2055. dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
  2056. in sorted(compat.iteritems(counts))]
  2057. lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
  2058. if memory_usage is None:
  2059. memory_usage = get_option('display.memory_usage')
  2060. if memory_usage:
  2061. # append memory usage of df to display
  2062. size_qualifier = ''
  2063. if memory_usage == 'deep':
  2064. deep = True
  2065. else:
  2066. # size_qualifier is just a best effort; not guaranteed to catch
  2067. # all cases (e.g., it misses categorical data even with object
  2068. # categories)
  2069. deep = False
  2070. if ('object' in counts or
  2071. self.index._is_memory_usage_qualified()):
  2072. size_qualifier = '+'
  2073. mem_usage = self.memory_usage(index=True, deep=deep).sum()
  2074. lines.append("memory usage: {mem}\n".format(
  2075. mem=_sizeof_fmt(mem_usage, size_qualifier)))
  2076. fmt.buffer_put_lines(buf, lines)
  2077. def memory_usage(self, index=True, deep=False):
  2078. """
  2079. Return the memory usage of each column in bytes.
  2080. The memory usage can optionally include the contribution of
  2081. the index and elements of `object` dtype.
  2082. This value is displayed in `DataFrame.info` by default. This can be
  2083. suppressed by setting ``pandas.options.display.memory_usage`` to False.
  2084. Parameters
  2085. ----------
  2086. index : bool, default True
  2087. Specifies whether to include the memory usage of the DataFrame's
2088. index in the returned Series. If ``index=True``, the memory usage of
2089. the index is the first item in the output.
  2090. deep : bool, default False
  2091. If True, introspect the data deeply by interrogating
  2092. `object` dtypes for system-level memory consumption, and include
  2093. it in the returned values.
  2094. Returns
  2095. -------
  2096. Series
  2097. A Series whose index is the original column names and whose values
2098. are the memory usage of each column in bytes.
  2099. See Also
  2100. --------
  2101. numpy.ndarray.nbytes : Total bytes consumed by the elements of an
  2102. ndarray.
  2103. Series.memory_usage : Bytes consumed by a Series.
  2104. Categorical : Memory-efficient array for string values with
  2105. many repeated values.
  2106. DataFrame.info : Concise summary of a DataFrame.
  2107. Examples
  2108. --------
  2109. >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
  2110. >>> data = dict([(t, np.ones(shape=5000).astype(t))
  2111. ... for t in dtypes])
  2112. >>> df = pd.DataFrame(data)
  2113. >>> df.head()
  2114. int64 float64 complex128 object bool
  2115. 0 1 1.0 (1+0j) 1 True
  2116. 1 1 1.0 (1+0j) 1 True
  2117. 2 1 1.0 (1+0j) 1 True
  2118. 3 1 1.0 (1+0j) 1 True
  2119. 4 1 1.0 (1+0j) 1 True
  2120. >>> df.memory_usage()
  2121. Index 80
  2122. int64 40000
  2123. float64 40000
  2124. complex128 80000
  2125. object 40000
  2126. bool 5000
  2127. dtype: int64
  2128. >>> df.memory_usage(index=False)
  2129. int64 40000
  2130. float64 40000
  2131. complex128 80000
  2132. object 40000
  2133. bool 5000
  2134. dtype: int64
  2135. The memory footprint of `object` dtype columns is ignored by default:
  2136. >>> df.memory_usage(deep=True)
  2137. Index 80
  2138. int64 40000
  2139. float64 40000
  2140. complex128 80000
  2141. object 160000
  2142. bool 5000
  2143. dtype: int64
  2144. Use a Categorical for efficient storage of an object-dtype column with
  2145. many repeated values.
  2146. >>> df['object'].astype('category').memory_usage(deep=True)
  2147. 5168
  2148. """
  2149. result = Series([c.memory_usage(index=False, deep=deep)
  2150. for col, c in self.iteritems()], index=self.columns)
  2151. if index:
  2152. result = Series(self.index.memory_usage(deep=deep),
  2153. index=['Index']).append(result)
  2154. return result
  2155. def transpose(self, *args, **kwargs):
  2156. """
  2157. Transpose index and columns.
  2158. Reflect the DataFrame over its main diagonal by writing rows as columns
  2159. and vice-versa. The property :attr:`.T` is an accessor to the method
  2160. :meth:`transpose`.
  2161. Parameters
  2162. ----------
  2163. copy : bool, default False
  2164. If True, the underlying data is copied. Otherwise (default), no
  2165. copy is made if possible.
  2166. *args, **kwargs
  2167. Additional keywords have no effect but might be accepted for
  2168. compatibility with numpy.
  2169. Returns
  2170. -------
  2171. DataFrame
  2172. The transposed DataFrame.
  2173. See Also
  2174. --------
  2175. numpy.transpose : Permute the dimensions of a given array.
  2176. Notes
  2177. -----
  2178. Transposing a DataFrame with mixed dtypes will result in a homogeneous
  2179. DataFrame with the `object` dtype. In such a case, a copy of the data
  2180. is always made.
  2181. Examples
  2182. --------
  2183. **Square DataFrame with homogeneous dtype**
  2184. >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
  2185. >>> df1 = pd.DataFrame(data=d1)
  2186. >>> df1
  2187. col1 col2
  2188. 0 1 3
  2189. 1 2 4
  2190. >>> df1_transposed = df1.T # or df1.transpose()
  2191. >>> df1_transposed
  2192. 0 1
  2193. col1 1 2
  2194. col2 3 4
  2195. When the dtype is homogeneous in the original DataFrame, we get a
  2196. transposed DataFrame with the same dtype:
  2197. >>> df1.dtypes
  2198. col1 int64
  2199. col2 int64
  2200. dtype: object
  2201. >>> df1_transposed.dtypes
  2202. 0 int64
  2203. 1 int64
  2204. dtype: object
  2205. **Non-square DataFrame with mixed dtypes**
  2206. >>> d2 = {'name': ['Alice', 'Bob'],
  2207. ... 'score': [9.5, 8],
  2208. ... 'employed': [False, True],
  2209. ... 'kids': [0, 0]}
  2210. >>> df2 = pd.DataFrame(data=d2)
  2211. >>> df2
  2212. name score employed kids
  2213. 0 Alice 9.5 False 0
  2214. 1 Bob 8.0 True 0
  2215. >>> df2_transposed = df2.T # or df2.transpose()
  2216. >>> df2_transposed
  2217. 0 1
  2218. name Alice Bob
  2219. score 9.5 8
  2220. employed False True
  2221. kids 0 0
  2222. When the DataFrame has mixed dtypes, we get a transposed DataFrame with
  2223. the `object` dtype:
  2224. >>> df2.dtypes
  2225. name object
  2226. score float64
  2227. employed bool
  2228. kids int64
  2229. dtype: object
  2230. >>> df2_transposed.dtypes
  2231. 0 object
  2232. 1 object
  2233. dtype: object
  2234. """
  2235. nv.validate_transpose(args, dict())
  2236. return super(DataFrame, self).transpose(1, 0, **kwargs)
  2237. T = property(transpose)
  2238. # ----------------------------------------------------------------------
  2239. # Picklability
  2240. # legacy pickle formats
  2241. def _unpickle_frame_compat(self, state): # pragma: no cover
  2242. if len(state) == 2: # pragma: no cover
  2243. series, idx = state
  2244. columns = sorted(series)
  2245. else:
  2246. series, cols, idx = state
  2247. columns = com._unpickle_array(cols)
  2248. index = com._unpickle_array(idx)
  2249. self._data = self._init_dict(series, index, columns, None)
  2250. def _unpickle_matrix_compat(self, state): # pragma: no cover
  2251. # old unpickling
  2252. (vals, idx, cols), object_state = state
  2253. index = com._unpickle_array(idx)
  2254. dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols),
  2255. copy=False)
  2256. if object_state is not None:
  2257. ovals, _, ocols = object_state
  2258. objects = DataFrame(ovals, index=index,
  2259. columns=com._unpickle_array(ocols), copy=False)
  2260. dm = dm.join(objects)
  2261. self._data = dm._data
  2262. # ----------------------------------------------------------------------
  2263. # Getting and setting elements
  2264. def get_value(self, index, col, takeable=False):
  2265. """
2266. Quickly retrieve a single value at the passed column and index.
  2267. .. deprecated:: 0.21.0
  2268. Use .at[] or .iat[] accessors instead.
  2269. Parameters
  2270. ----------
  2271. index : row label
  2272. col : column label
  2273. takeable : interpret the index/col as indexers, default False
  2274. Returns
  2275. -------
  2276. scalar
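Examples
--------
A sketch of the recommended replacements (labels are illustrative):
>>> df = pd.DataFrame({'A': [1, 2]}, index=['x', 'y'])
>>> df.at['x', 'A']  # label-based, replaces get_value
1
>>> df.iat[0, 0]  # positional, replaces takeable=True
1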
  2277. """
  2278. warnings.warn("get_value is deprecated and will be removed "
  2279. "in a future release. Please use "
  2280. ".at[] or .iat[] accessors instead", FutureWarning,
  2281. stacklevel=2)
  2282. return self._get_value(index, col, takeable=takeable)
  2283. def _get_value(self, index, col, takeable=False):
  2284. if takeable:
  2285. series = self._iget_item_cache(col)
  2286. return com.maybe_box_datetimelike(series._values[index])
  2287. series = self._get_item_cache(col)
  2288. engine = self.index._engine
  2289. try:
  2290. return engine.get_value(series._values, index)
  2291. except (TypeError, ValueError):
  2292. # we cannot handle direct indexing
  2293. # use positional
  2294. col = self.columns.get_loc(col)
  2295. index = self.index.get_loc(index)
  2296. return self._get_value(index, col, takeable=True)
  2297. _get_value.__doc__ = get_value.__doc__
  2298. def set_value(self, index, col, value, takeable=False):
  2299. """
2300. Put a single value at the passed column and index.
  2301. .. deprecated:: 0.21.0
  2302. Use .at[] or .iat[] accessors instead.
  2303. Parameters
  2304. ----------
  2305. index : row label
  2306. col : column label
  2307. value : scalar
  2308. takeable : interpret the index/col as indexers, default False
  2309. Returns
  2310. -------
  2311. DataFrame
2312. If the label pair is contained, the result will be a reference to
2313. the calling DataFrame; otherwise a new object.
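Examples
--------
A sketch of the recommended replacement (labels are illustrative):
>>> df = pd.DataFrame({'A': [1, 2]}, index=['x', 'y'])
>>> df.at['x', 'A'] = 10  # replaces set_value
>>> df.at['x', 'A']
10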
  2314. """
  2315. warnings.warn("set_value is deprecated and will be removed "
  2316. "in a future release. Please use "
  2317. ".at[] or .iat[] accessors instead", FutureWarning,
  2318. stacklevel=2)
  2319. return self._set_value(index, col, value, takeable=takeable)
  2320. def _set_value(self, index, col, value, takeable=False):
  2321. try:
  2322. if takeable is True:
  2323. series = self._iget_item_cache(col)
  2324. return series._set_value(index, value, takeable=True)
  2325. series = self._get_item_cache(col)
  2326. engine = self.index._engine
  2327. engine.set_value(series._values, index, value)
  2328. return self
  2329. except (KeyError, TypeError):
  2330. # set using a non-recursive method & reset the cache
  2331. if takeable:
  2332. self.iloc[index, col] = value
  2333. else:
  2334. self.loc[index, col] = value
  2335. self._item_cache.pop(col, None)
  2336. return self
  2337. _set_value.__doc__ = set_value.__doc__
  2338. def _ixs(self, i, axis=0):
  2339. """
  2340. Parameters
  2341. ----------
  2342. i : int, slice, or sequence of integers
  2343. axis : int
  2344. Notes
  2345. -----
  2346. If slice passed, the resulting data will be a view.
  2347. """
  2348. # irow
  2349. if axis == 0:
  2350. if isinstance(i, slice):
  2351. return self[i]
  2352. else:
  2353. label = self.index[i]
  2354. if isinstance(label, Index):
  2355. # a location index by definition
  2356. result = self.take(i, axis=axis)
  2357. copy = True
  2358. else:
  2359. new_values = self._data.fast_xs(i)
  2360. if is_scalar(new_values):
  2361. return new_values
  2362. # if we are a copy, mark as such
  2363. copy = (isinstance(new_values, np.ndarray) and
  2364. new_values.base is None)
  2365. result = self._constructor_sliced(new_values,
  2366. index=self.columns,
  2367. name=self.index[i],
  2368. dtype=new_values.dtype)
  2369. result._set_is_copy(self, copy=copy)
  2370. return result
  2371. # icol
  2372. else:
  2373. label = self.columns[i]
  2374. if isinstance(i, slice):
  2375. # need to return view
  2376. lab_slice = slice(label[0], label[-1])
  2377. return self.loc[:, lab_slice]
  2378. else:
  2379. if isinstance(label, Index):
  2380. return self._take(i, axis=1)
  2381. index_len = len(self.index)
  2382. # if the values returned are not the same length
2383. # as the index (in other words, a not-found value), iget returns
  2384. # a 0-len ndarray. This is effectively catching
  2385. # a numpy error (as numpy should really raise)
  2386. values = self._data.iget(i)
  2387. if index_len and not len(values):
  2388. values = np.array([np.nan] * index_len, dtype=object)
  2389. result = self._box_col_values(values, label)
  2390. # this is a cached value, mark it so
  2391. result._set_as_cached(label, self)
  2392. return result
  2393. def __getitem__(self, key):
  2394. key = lib.item_from_zerodim(key)
  2395. key = com.apply_if_callable(key, self)
  2396. # shortcut if the key is in columns
  2397. try:
  2398. if self.columns.is_unique and key in self.columns:
  2399. if self.columns.nlevels > 1:
  2400. return self._getitem_multilevel(key)
  2401. return self._get_item_cache(key)
  2402. except (TypeError, ValueError):
  2403. # The TypeError correctly catches non hashable "key" (e.g. list)
  2404. # The ValueError can be removed once GH #21729 is fixed
  2405. pass
  2406. # Do we have a slicer (on rows)?
  2407. indexer = convert_to_index_sliceable(self, key)
  2408. if indexer is not None:
  2409. return self._slice(indexer, axis=0)
  2410. # Do we have a (boolean) DataFrame?
  2411. if isinstance(key, DataFrame):
  2412. return self._getitem_frame(key)
  2413. # Do we have a (boolean) 1d indexer?
  2414. if com.is_bool_indexer(key):
  2415. return self._getitem_bool_array(key)
  2416. # We are left with two options: a single key, and a collection of keys,
  2417. # We interpret tuples as collections only for non-MultiIndex
  2418. is_single_key = isinstance(key, tuple) or not is_list_like(key)
  2419. if is_single_key:
  2420. if self.columns.nlevels > 1:
  2421. return self._getitem_multilevel(key)
  2422. indexer = self.columns.get_loc(key)
  2423. if is_integer(indexer):
  2424. indexer = [indexer]
  2425. else:
  2426. if is_iterator(key):
  2427. key = list(key)
  2428. indexer = self.loc._convert_to_indexer(key, axis=1,
  2429. raise_missing=True)
  2430. # take() does not accept boolean indexers
  2431. if getattr(indexer, "dtype", None) == bool:
  2432. indexer = np.where(indexer)[0]
  2433. data = self._take(indexer, axis=1)
  2434. if is_single_key:
  2435. # What does looking for a single key in a non-unique index return?
  2436. # The behavior is inconsistent. It returns a Series, except when
  2437. # - the key itself is repeated (test on data.shape, #9519), or
  2438. # - we have a MultiIndex on columns (test on self.columns, #21309)
  2439. if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
  2440. data = data[key]
  2441. return data
  2442. def _getitem_bool_array(self, key):
  2443. # also raises Exception if object array with NA values
  2444. # warning here just in case -- previously __setitem__ was
  2445. # reindexing but __getitem__ was not; it seems more reasonable to
  2446. # go with the __setitem__ behavior since that is more consistent
  2447. # with all other indexing behavior
  2448. if isinstance(key, Series) and not key.index.equals(self.index):
  2449. warnings.warn("Boolean Series key will be reindexed to match "
  2450. "DataFrame index.", UserWarning, stacklevel=3)
  2451. elif len(key) != len(self.index):
  2452. raise ValueError('Item wrong length %d instead of %d.' %
  2453. (len(key), len(self.index)))
  2454. # check_bool_indexer will throw exception if Series key cannot
  2455. # be reindexed to match DataFrame rows
  2456. key = check_bool_indexer(self.index, key)
  2457. indexer = key.nonzero()[0]
  2458. return self._take(indexer, axis=0)
  2459. def _getitem_multilevel(self, key):
  2460. loc = self.columns.get_loc(key)
  2461. if isinstance(loc, (slice, Series, np.ndarray, Index)):
  2462. new_columns = self.columns[loc]
  2463. result_columns = maybe_droplevels(new_columns, key)
  2464. if self._is_mixed_type:
  2465. result = self.reindex(columns=new_columns)
  2466. result.columns = result_columns
  2467. else:
  2468. new_values = self.values[:, loc]
  2469. result = self._constructor(new_values, index=self.index,
  2470. columns=result_columns)
  2471. result = result.__finalize__(self)
  2472. # If there is only one column being returned, and its name is
  2473. # either an empty string, or a tuple with an empty string as its
  2474. # first element, then treat the empty string as a placeholder
  2475. # and return the column as if the user had provided that empty
  2476. # string in the key. If the result is a Series, exclude the
  2477. # implied empty string from its name.
  2478. if len(result.columns) == 1:
  2479. top = result.columns[0]
  2480. if isinstance(top, tuple):
  2481. top = top[0]
  2482. if top == '':
  2483. result = result['']
  2484. if isinstance(result, Series):
  2485. result = self._constructor_sliced(result,
  2486. index=self.index,
  2487. name=key)
  2488. result._set_is_copy(self)
  2489. return result
  2490. else:
  2491. return self._get_item_cache(key)
  2492. def _getitem_frame(self, key):
  2493. if key.values.size and not is_bool_dtype(key.values):
  2494. raise ValueError('Must pass DataFrame with boolean values only')
  2495. return self.where(key)
  2496. def query(self, expr, inplace=False, **kwargs):
  2497. """
  2498. Query the columns of a DataFrame with a boolean expression.
  2499. Parameters
  2500. ----------
  2501. expr : str
  2502. The query string to evaluate. You can refer to variables
  2503. in the environment by prefixing them with an '@' character like
  2504. ``@a + b``.
  2505. inplace : bool
  2506. Whether the query should modify the data in place or return
  2507. a modified copy.
  2508. **kwargs
  2509. See the documentation for :func:`eval` for complete details
  2510. on the keyword arguments accepted by :meth:`DataFrame.query`.
  2511. .. versionadded:: 0.18.0
  2512. Returns
  2513. -------
  2514. DataFrame
  2515. DataFrame resulting from the provided query expression.
  2516. See Also
  2517. --------
  2518. eval : Evaluate a string describing operations on
  2519. DataFrame columns.
  2520. DataFrame.eval : Evaluate a string describing operations on
  2521. DataFrame columns.
  2522. Notes
  2523. -----
  2524. The result of the evaluation of this expression is first passed to
  2525. :attr:`DataFrame.loc` and if that fails because of a
  2526. multidimensional key (e.g., a DataFrame) then the result will be passed
  2527. to :meth:`DataFrame.__getitem__`.
  2528. This method uses the top-level :func:`eval` function to
  2529. evaluate the passed query.
  2530. The :meth:`~pandas.DataFrame.query` method uses a slightly
  2531. modified Python syntax by default. For example, the ``&`` and ``|``
  2532. (bitwise) operators have the precedence of their boolean cousins,
  2533. :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
  2534. however the semantics are different.
  2535. You can change the semantics of the expression by passing the keyword
  2536. argument ``parser='python'``. This enforces the same semantics as
  2537. evaluation in Python space. Likewise, you can pass ``engine='python'``
  2538. to evaluate an expression using Python itself as a backend. This is not
  2539. recommended as it is inefficient compared to using ``numexpr`` as the
  2540. engine.
  2541. The :attr:`DataFrame.index` and
  2542. :attr:`DataFrame.columns` attributes of the
  2543. :class:`~pandas.DataFrame` instance are placed in the query namespace
  2544. by default, which allows you to treat both the index and columns of the
  2545. frame as a column in the frame.
  2546. The identifier ``index`` is used for the frame index; you can also
  2547. use the name of the index to identify it in a query. Please note that
  2548. Python keywords may not be used as identifiers.
  2549. For further details and examples see the ``query`` documentation in
  2550. :ref:`indexing <indexing.query>`.
  2551. Examples
  2552. --------
  2553. >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
  2554. >>> df
  2555. A B
  2556. 0 1 10
  2557. 1 2 8
  2558. 2 3 6
  2559. 3 4 4
  2560. 4 5 2
  2561. >>> df.query('A > B')
  2562. A B
  2563. 4 5 2
  2564. The previous expression is equivalent to
  2565. >>> df[df.A > df.B]
  2566. A B
  2567. 4 5 2
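Variables in the enclosing scope can be referenced by prefixing them
with ``@`` (a small sketch; ``threshold`` is a local name introduced
here):
>>> threshold = 4
>>> df.query('A > @threshold')
   A  B
4  5  2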
  2568. """
  2569. inplace = validate_bool_kwarg(inplace, 'inplace')
  2570. if not isinstance(expr, compat.string_types):
  2571. msg = "expr must be a string to be evaluated, {0} given"
  2572. raise ValueError(msg.format(type(expr)))
  2573. kwargs['level'] = kwargs.pop('level', 0) + 1
  2574. kwargs['target'] = None
  2575. res = self.eval(expr, **kwargs)
  2576. try:
  2577. new_data = self.loc[res]
  2578. except ValueError:
  2579. # when res is multi-dimensional loc raises, but this is sometimes a
  2580. # valid query
  2581. new_data = self[res]
  2582. if inplace:
  2583. self._update_inplace(new_data)
  2584. else:
  2585. return new_data
  2586. def eval(self, expr, inplace=False, **kwargs):
  2587. """
  2588. Evaluate a string describing operations on DataFrame columns.
  2589. Operates on columns only, not specific rows or elements. This allows
  2590. `eval` to run arbitrary code, which can make you vulnerable to code
  2591. injection if you pass user input to this function.
  2592. Parameters
  2593. ----------
  2594. expr : str
  2595. The expression string to evaluate.
  2596. inplace : bool, default False
  2597. If the expression contains an assignment, whether to perform the
  2598. operation inplace and mutate the existing DataFrame. Otherwise,
  2599. a new DataFrame is returned.
2600. .. versionadded:: 0.18.0
  2601. kwargs : dict
  2602. See the documentation for :func:`eval` for complete details
  2603. on the keyword arguments accepted by
  2604. :meth:`~pandas.DataFrame.query`.
  2605. Returns
  2606. -------
  2607. ndarray, scalar, or pandas object
  2608. The result of the evaluation.
  2609. See Also
  2610. --------
  2611. DataFrame.query : Evaluates a boolean expression to query the columns
  2612. of a frame.
  2613. DataFrame.assign : Can evaluate an expression or function to create new
  2614. values for a column.
  2615. eval : Evaluate a Python expression as a string using various
  2616. backends.
  2617. Notes
  2618. -----
  2619. For more details see the API documentation for :func:`~eval`.
  2620. For detailed examples see :ref:`enhancing performance with eval
  2621. <enhancingperf.eval>`.
  2622. Examples
  2623. --------
  2624. >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
  2625. >>> df
  2626. A B
  2627. 0 1 10
  2628. 1 2 8
  2629. 2 3 6
  2630. 3 4 4
  2631. 4 5 2
  2632. >>> df.eval('A + B')
  2633. 0 11
  2634. 1 10
  2635. 2 9
  2636. 3 8
  2637. 4 7
  2638. dtype: int64
2639. Assignment is allowed, though by default the original DataFrame is not
  2640. modified.
  2641. >>> df.eval('C = A + B')
  2642. A B C
  2643. 0 1 10 11
  2644. 1 2 8 10
  2645. 2 3 6 9
  2646. 3 4 4 8
  2647. 4 5 2 7
  2648. >>> df
  2649. A B
  2650. 0 1 10
  2651. 1 2 8
  2652. 2 3 6
  2653. 3 4 4
  2654. 4 5 2
  2655. Use ``inplace=True`` to modify the original DataFrame.
  2656. >>> df.eval('C = A + B', inplace=True)
  2657. >>> df
  2658. A B C
  2659. 0 1 10 11
  2660. 1 2 8 10
  2661. 2 3 6 9
  2662. 3 4 4 8
  2663. 4 5 2 7
  2664. """
  2665. from pandas.core.computation.eval import eval as _eval
  2666. inplace = validate_bool_kwarg(inplace, 'inplace')
  2667. resolvers = kwargs.pop('resolvers', None)
  2668. kwargs['level'] = kwargs.pop('level', 0) + 1
  2669. if resolvers is None:
  2670. index_resolvers = self._get_index_resolvers()
  2671. resolvers = dict(self.iteritems()), index_resolvers
  2672. if 'target' not in kwargs:
  2673. kwargs['target'] = self
  2674. kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
  2675. return _eval(expr, inplace=inplace, **kwargs)
  2676. def select_dtypes(self, include=None, exclude=None):
  2677. """
  2678. Return a subset of the DataFrame's columns based on the column dtypes.
  2679. Parameters
  2680. ----------
  2681. include, exclude : scalar or list-like
  2682. A selection of dtypes or strings to be included/excluded. At least
  2683. one of these parameters must be supplied.
  2684. Returns
  2685. -------
  2686. DataFrame
  2687. The subset of the frame including the dtypes in ``include`` and
  2688. excluding the dtypes in ``exclude``.
  2689. Raises
  2690. ------
  2691. ValueError
  2692. * If both of ``include`` and ``exclude`` are empty
  2693. * If ``include`` and ``exclude`` have overlapping elements
  2694. * If any kind of string dtype is passed in.
  2695. Notes
  2696. -----
  2697. * To select all *numeric* types, use ``np.number`` or ``'number'``
  2698. * To select strings you must use the ``object`` dtype, but note that
  2699. this will return *all* object dtype columns
  2700. * See the `numpy dtype hierarchy
  2701. <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
  2702. * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
  2703. ``'datetime64'``
  2704. * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
  2705. ``'timedelta64'``
  2706. * To select Pandas categorical dtypes, use ``'category'``
  2707. * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
  2708. 0.20.0) or ``'datetime64[ns, tz]'``
  2709. Examples
  2710. --------
  2711. >>> df = pd.DataFrame({'a': [1, 2] * 3,
  2712. ... 'b': [True, False] * 3,
  2713. ... 'c': [1.0, 2.0] * 3})
  2714. >>> df
  2715. a b c
  2716. 0 1 True 1.0
  2717. 1 2 False 2.0
  2718. 2 1 True 1.0
  2719. 3 2 False 2.0
  2720. 4 1 True 1.0
  2721. 5 2 False 2.0
  2722. >>> df.select_dtypes(include='bool')
  2723. b
  2724. 0 True
  2725. 1 False
  2726. 2 True
  2727. 3 False
  2728. 4 True
  2729. 5 False
  2730. >>> df.select_dtypes(include=['float64'])
  2731. c
  2732. 0 1.0
  2733. 1 2.0
  2734. 2 1.0
  2735. 3 2.0
  2736. 4 1.0
  2737. 5 2.0
  2738. >>> df.select_dtypes(exclude=['int'])
  2739. b c
  2740. 0 True 1.0
  2741. 1 False 2.0
  2742. 2 True 1.0
  2743. 3 False 2.0
  2744. 4 True 1.0
  2745. 5 False 2.0
  2746. """
  2747. def _get_info_slice(obj, indexer):
  2748. """Slice the info axis of `obj` with `indexer`."""
  2749. if not hasattr(obj, '_info_axis_number'):
  2750. msg = 'object of type {typ!r} has no info axis'
  2751. raise TypeError(msg.format(typ=type(obj).__name__))
  2752. slices = [slice(None)] * obj.ndim
  2753. slices[obj._info_axis_number] = indexer
  2754. return tuple(slices)
  2755. if not is_list_like(include):
  2756. include = (include,) if include is not None else ()
  2757. if not is_list_like(exclude):
  2758. exclude = (exclude,) if exclude is not None else ()
  2759. selection = tuple(map(frozenset, (include, exclude)))
  2760. if not any(selection):
  2761. raise ValueError('at least one of include or exclude must be '
  2762. 'nonempty')
  2763. # convert the myriad valid dtypes object to a single representation
  2764. include, exclude = map(
  2765. lambda x: frozenset(map(infer_dtype_from_object, x)), selection)
  2766. for dtypes in (include, exclude):
  2767. invalidate_string_dtypes(dtypes)
  2768. # can't both include AND exclude!
  2769. if not include.isdisjoint(exclude):
  2770. raise ValueError('include and exclude overlap on {inc_ex}'.format(
  2771. inc_ex=(include & exclude)))
  2772. # empty include/exclude -> defaults to True
  2773. # three cases (we've already raised if both are empty)
  2774. # case 1: empty include, nonempty exclude
  2775. # we have True, True, ... True for include, same for exclude
  2776. # in the loop below we get the excluded
  2777. # and when we call '&' below we get only the excluded
  2778. # case 2: nonempty include, empty exclude
  2779. # same as case 1, but with include
  2780. # case 3: both nonempty
  2781. # the "union" of the logic of case 1 and case 2:
  2782. # we get the included and excluded, and return their logical and
  2783. include_these = Series(not bool(include), index=self.columns)
  2784. exclude_these = Series(not bool(exclude), index=self.columns)
  2785. def is_dtype_instance_mapper(idx, dtype):
  2786. return idx, functools.partial(issubclass, dtype.type)
  2787. for idx, f in itertools.starmap(is_dtype_instance_mapper,
  2788. enumerate(self.dtypes)):
  2789. if include: # checks for the case of empty include or exclude
  2790. include_these.iloc[idx] = any(map(f, include))
  2791. if exclude:
  2792. exclude_these.iloc[idx] = not any(map(f, exclude))
  2793. dtype_indexer = include_these & exclude_these
  2794. return self.loc[_get_info_slice(self, dtype_indexer)]
  2795. def _box_item_values(self, key, values):
  2796. items = self.columns[self.columns.get_loc(key)]
  2797. if values.ndim == 2:
  2798. return self._constructor(values.T, columns=items, index=self.index)
  2799. else:
  2800. return self._box_col_values(values, items)
  2801. def _box_col_values(self, values, items):
  2802. """
  2803. Provide boxed values for a column.
  2804. """
  2805. klass = self._constructor_sliced
  2806. return klass(values, index=self.index, name=items, fastpath=True)
  2807. def __setitem__(self, key, value):
  2808. key = com.apply_if_callable(key, self)
  2809. # see if we can slice the rows
  2810. indexer = convert_to_index_sliceable(self, key)
  2811. if indexer is not None:
  2812. return self._setitem_slice(indexer, value)
  2813. if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2:
  2814. self._setitem_frame(key, value)
  2815. elif isinstance(key, (Series, np.ndarray, list, Index)):
  2816. self._setitem_array(key, value)
  2817. else:
  2818. # set column
  2819. self._set_item(key, value)
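# Hedged usage sketch (toy data, not part of the original source) of the
# three dispatch paths above: a single column label, a list of column
# labels, and a boolean DataFrame mask.
# >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
# >>> df['c'] = df['a'] + df['b']          # _set_item: one column
# >>> df[['a', 'b']] = 0                   # _setitem_array: several columns
# >>> df[df == 0] = -1                     # _setitem_frame: boolean mask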
  2820. def _setitem_slice(self, key, value):
  2821. self._check_setitem_copy()
  2822. self.loc._setitem_with_indexer(key, value)
  2823. def _setitem_array(self, key, value):
2824. # com.is_bool_indexer also raises if an object array contains NA values
  2825. if com.is_bool_indexer(key):
  2826. if len(key) != len(self.index):
  2827. raise ValueError('Item wrong length %d instead of %d!' %
  2828. (len(key), len(self.index)))
  2829. key = check_bool_indexer(self.index, key)
  2830. indexer = key.nonzero()[0]
  2831. self._check_setitem_copy()
  2832. self.loc._setitem_with_indexer(indexer, value)
  2833. else:
  2834. if isinstance(value, DataFrame):
  2835. if len(value.columns) != len(key):
  2836. raise ValueError('Columns must be same length as key')
  2837. for k1, k2 in zip(key, value.columns):
  2838. self[k1] = value[k2]
  2839. else:
  2840. indexer = self.loc._convert_to_indexer(key, axis=1)
  2841. self._check_setitem_copy()
  2842. self.loc._setitem_with_indexer((slice(None), indexer), value)
  2843. def _setitem_frame(self, key, value):
  2844. # support boolean setting with DataFrame input, e.g.
  2845. # df[df > df2] = 0
  2846. if isinstance(key, np.ndarray):
  2847. if key.shape != self.shape:
  2848. raise ValueError(
  2849. 'Array conditional must be same shape as self'
  2850. )
  2851. key = self._constructor(key, **self._construct_axes_dict())
  2852. if key.values.size and not is_bool_dtype(key.values):
  2853. raise TypeError(
  2854. 'Must pass DataFrame or 2-d ndarray with boolean values only'
  2855. )
  2856. self._check_inplace_setting(value)
  2857. self._check_setitem_copy()
2858. self._where(~key, value, inplace=True)
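# Minimal sketch (toy data, not from the original source) of the
# boolean-mask assignment handled by _setitem_frame:
# >>> df = pd.DataFrame({'a': [1, -1], 'b': [-2, 2]})
# >>> df[df < 0] = 0
# >>> df
#    a  b
# 0  1  0
# 1  0  2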
  2859. def _ensure_valid_index(self, value):
  2860. """
2861. Ensure that, if we don't have an index, we can create one from the
2862. passed value.
  2863. """
  2864. # GH5632, make sure that we are a Series convertible
  2865. if not len(self.index) and is_list_like(value):
  2866. try:
  2867. value = Series(value)
  2868. except (ValueError, NotImplementedError, TypeError):
  2869. raise ValueError('Cannot set a frame with no defined index '
  2870. 'and a value that cannot be converted to a '
  2871. 'Series')
  2872. self._data = self._data.reindex_axis(value.index.copy(), axis=1,
  2873. fill_value=np.nan)
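# Sketch of the guarantee above (illustrative toy data, not original
# source): assigning a Series to an empty, index-less frame adopts that
# Series' index.
# >>> df = pd.DataFrame()
# >>> df['a'] = pd.Series([1, 2], index=['x', 'y'])
# >>> list(df.index)
# ['x', 'y']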
  2874. def _set_item(self, key, value):
  2875. """
  2876. Add series to DataFrame in specified column.
2877. If the value is a numpy array (not a Series/TimeSeries), it must be
2878. the same length as the DataFrame's index or an error will be raised.
2879. Series/TimeSeries will be conformed to the DataFrame's index to
2880. ensure homogeneity.
  2881. """
  2882. self._ensure_valid_index(value)
  2883. value = self._sanitize_column(key, value)
  2884. NDFrame._set_item(self, key, value)
  2885. # check if we are modifying a copy
  2886. # try to set first as we want an invalid
  2887. # value exception to occur first
  2888. if len(self):
  2889. self._check_setitem_copy()
  2890. def insert(self, loc, column, value, allow_duplicates=False):
  2891. """
  2892. Insert column into DataFrame at specified location.
  2893. Raises a ValueError if `column` is already contained in the DataFrame,
  2894. unless `allow_duplicates` is set to True.
  2895. Parameters
  2896. ----------
2897. loc : int
2898. Insertion index. Must satisfy ``0 <= loc <= len(columns)``.
2899. column : string, number, or hashable object
2900. Label of the inserted column.
2901. value : int, Series, or array-like
2902. allow_duplicates : bool, optional, default False
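Examples
--------
A minimal illustration with toy data (example added for clarity, not
part of the original docstring):
>>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
>>> df.insert(1, 'c', [5, 6])
>>> df
   a  c  b
0  1  5  3
1  2  6  4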
  2903. """
  2904. self._ensure_valid_index(value)
  2905. value = self._sanitize_column(column, value, broadcast=False)
  2906. self._data.insert(loc, column, value,
  2907. allow_duplicates=allow_duplicates)
  2908. def assign(self, **kwargs):
  2909. r"""
  2910. Assign new columns to a DataFrame.
  2911. Returns a new object with all original columns in addition to new ones.
  2912. Existing columns that are re-assigned will be overwritten.
  2913. Parameters
  2914. ----------
  2915. **kwargs : dict of {str: callable or Series}
  2916. The column names are keywords. If the values are
  2917. callable, they are computed on the DataFrame and
2918. assigned to the new columns. The callable must not
2919. change the input DataFrame (though pandas doesn't check it).
2920. If the values are not callable (e.g. a Series, scalar, or array),
2921. they are simply assigned.
  2922. Returns
  2923. -------
  2924. DataFrame
  2925. A new DataFrame with the new columns in addition to
  2926. all the existing columns.
  2927. Notes
  2928. -----
  2929. Assigning multiple columns within the same ``assign`` is possible.
  2930. For Python 3.6 and above, later items in '\*\*kwargs' may refer to
  2931. newly created or modified columns in 'df'; items are computed and
  2932. assigned into 'df' in order. For Python 3.5 and below, the order of
2933. keyword arguments is not specified, so you cannot refer to newly
2934. created or modified columns. All items are computed first and then
2935. assigned in alphabetical order.
2936. .. versionchanged:: 0.23.0
  2937. Keyword argument order is maintained for Python 3.6 and later.
  2938. Examples
  2939. --------
  2940. >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
  2941. ... index=['Portland', 'Berkeley'])
  2942. >>> df
  2943. temp_c
  2944. Portland 17.0
  2945. Berkeley 25.0
  2946. Where the value is a callable, evaluated on `df`:
  2947. >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
  2948. temp_c temp_f
  2949. Portland 17.0 62.6
  2950. Berkeley 25.0 77.0
  2951. Alternatively, the same behavior can be achieved by directly
  2952. referencing an existing Series or sequence:
  2953. >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
  2954. temp_c temp_f
  2955. Portland 17.0 62.6
  2956. Berkeley 25.0 77.0
  2957. In Python 3.6+, you can create multiple columns within the same assign
  2958. where one of the columns depends on another one defined within the same
  2959. assign:
  2960. >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
  2961. ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
  2962. temp_c temp_f temp_k
  2963. Portland 17.0 62.6 290.15
  2964. Berkeley 25.0 77.0 298.15
  2965. """
  2966. data = self.copy()
  2967. # >= 3.6 preserve order of kwargs
  2968. if PY36:
  2969. for k, v in kwargs.items():
  2970. data[k] = com.apply_if_callable(v, data)
  2971. else:
  2972. # <= 3.5: do all calculations first...
  2973. results = OrderedDict()
  2974. for k, v in kwargs.items():
  2975. results[k] = com.apply_if_callable(v, data)
2976. # sort by column name for a deterministic assignment order
  2977. results = sorted(results.items())
  2978. # ... and then assign
  2979. for k, v in results:
  2980. data[k] = v
  2981. return data
  2982. def _sanitize_column(self, key, value, broadcast=True):
  2983. """
  2984. Ensures new columns (which go into the BlockManager as new blocks) are
  2985. always copied and converted into an array.
  2986. Parameters
  2987. ----------
  2988. key : object
  2989. value : scalar, Series, or array-like
  2990. broadcast : bool, default True
  2991. If ``key`` matches multiple duplicate column names in the
  2992. DataFrame, this parameter indicates whether ``value`` should be
  2993. tiled so that the returned array contains a (duplicated) column for
  2994. each occurrence of the key. If False, ``value`` will not be tiled.
  2995. Returns
  2996. -------
  2997. numpy.ndarray
  2998. """
  2999. def reindexer(value):
  3000. # reindex if necessary
  3001. if value.index.equals(self.index) or not len(self.index):
  3002. value = value._values.copy()
  3003. else:
  3004. # GH 4107
  3005. try:
  3006. value = value.reindex(self.index)._values
  3007. except Exception as e:
  3008. # duplicate axis
  3009. if not value.index.is_unique:
  3010. raise e
  3011. # other
  3012. raise TypeError('incompatible index of inserted column '
  3013. 'with frame index')
  3014. return value
  3015. if isinstance(value, Series):
  3016. value = reindexer(value)
  3017. elif isinstance(value, DataFrame):
  3018. # align right-hand-side columns if self.columns
  3019. # is multi-index and self[key] is a sub-frame
  3020. if isinstance(self.columns, MultiIndex) and key in self.columns:
  3021. loc = self.columns.get_loc(key)
  3022. if isinstance(loc, (slice, Series, np.ndarray, Index)):
  3023. cols = maybe_droplevels(self.columns[loc], key)
  3024. if len(cols) and not cols.equals(value.columns):
  3025. value = value.reindex(cols, axis=1)
  3026. # now align rows
  3027. value = reindexer(value).T
  3028. elif isinstance(value, ExtensionArray):
  3029. # Explicitly copy here, instead of in sanitize_index,
  3030. # as sanitize_index won't copy an EA, even with copy=True
  3031. value = value.copy()
  3032. value = sanitize_index(value, self.index, copy=False)
  3033. elif isinstance(value, Index) or is_sequence(value):
  3034. # turn me into an ndarray
  3035. value = sanitize_index(value, self.index, copy=False)
  3036. if not isinstance(value, (np.ndarray, Index)):
  3037. if isinstance(value, list) and len(value) > 0:
  3038. value = maybe_convert_platform(value)
  3039. else:
  3040. value = com.asarray_tuplesafe(value)
  3041. elif value.ndim == 2:
  3042. value = value.copy().T
  3043. elif isinstance(value, Index):
  3044. value = value.copy(deep=True)
  3045. else:
  3046. value = value.copy()
  3047. # possibly infer to datetimelike
  3048. if is_object_dtype(value.dtype):
  3049. value = maybe_infer_to_datetimelike(value)
  3050. else:
  3051. # cast ignores pandas dtypes. so save the dtype first
  3052. infer_dtype, _ = infer_dtype_from_scalar(
  3053. value, pandas_dtype=True)
  3054. # upcast
  3055. value = cast_scalar_to_array(len(self.index), value)
  3056. value = maybe_cast_to_datetime(value, infer_dtype)
  3057. # return internal types directly
  3058. if is_extension_type(value) or is_extension_array_dtype(value):
  3059. return value
  3060. # broadcast across multiple columns if necessary
  3061. if broadcast and key in self.columns and value.ndim == 1:
  3062. if (not self.columns.is_unique or
  3063. isinstance(self.columns, MultiIndex)):
  3064. existing_piece = self[key]
  3065. if isinstance(existing_piece, DataFrame):
  3066. value = np.tile(value, (len(existing_piece.columns), 1))
  3067. return np.atleast_2d(np.asarray(value))
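# Hedged illustration (toy data, not original source) of the duplicate-
# column broadcast performed above: a 1-D value assigned to a duplicated
# label is tiled across every occurrence of that label.
# >>> df = pd.DataFrame([[1, 2]], columns=['a', 'a'])
# >>> df['a'] = [9]
# >>> df
#    a  a
# 0  9  9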
  3068. @property
  3069. def _series(self):
  3070. return {item: Series(self._data.iget(idx), index=self.index, name=item)
  3071. for idx, item in enumerate(self.columns)}
  3072. def lookup(self, row_labels, col_labels):
  3073. """
  3074. Label-based "fancy indexing" function for DataFrame.
  3075. Given equal-length arrays of row and column labels, return an
  3076. array of the values corresponding to each (row, col) pair.
  3077. Parameters
  3078. ----------
  3079. row_labels : sequence
  3080. The row labels to use for lookup
  3081. col_labels : sequence
  3082. The column labels to use for lookup
  3083. Notes
  3084. -----
  3085. Akin to::
  3086. result = [df.get_value(row, col)
  3087. for row, col in zip(row_labels, col_labels)]
3088. Returns
3089. -------
3090. numpy.ndarray
3091. The found values.
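Examples
--------
A minimal sketch with toy data (example added for clarity, not part of
the original docstring):
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['x', 'y'])
>>> df.lookup(['x', 'y'], ['A', 'B'])
array([1, 4])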
  3092. """
  3093. n = len(row_labels)
  3094. if n != len(col_labels):
  3095. raise ValueError('Row labels must have same size as column labels')
  3096. thresh = 1000
  3097. if not self._is_mixed_type or n > thresh:
  3098. values = self.values
  3099. ridx = self.index.get_indexer(row_labels)
  3100. cidx = self.columns.get_indexer(col_labels)
3101. if (ridx == -1).any():
3102. raise KeyError('One or more row labels were not found')
3103. if (cidx == -1).any():
3104. raise KeyError('One or more column labels were not found')
  3105. flat_index = ridx * len(self.columns) + cidx
  3106. result = values.flat[flat_index]
  3107. else:
  3108. result = np.empty(n, dtype='O')
  3109. for i, (r, c) in enumerate(zip(row_labels, col_labels)):
  3110. result[i] = self._get_value(r, c)
  3111. if is_object_dtype(result):
  3112. result = lib.maybe_convert_objects(result)
  3113. return result
  3114. # ----------------------------------------------------------------------
  3115. # Reindexing and alignment
  3116. def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
  3117. copy):
  3118. frame = self
  3119. columns = axes['columns']
  3120. if columns is not None:
  3121. frame = frame._reindex_columns(columns, method, copy, level,
  3122. fill_value, limit, tolerance)
  3123. index = axes['index']
  3124. if index is not None:
  3125. frame = frame._reindex_index(index, method, copy, level,
  3126. fill_value, limit, tolerance)
  3127. return frame
  3128. def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
  3129. limit=None, tolerance=None):
  3130. new_index, indexer = self.index.reindex(new_index, method=method,
  3131. level=level, limit=limit,
  3132. tolerance=tolerance)
  3133. return self._reindex_with_indexers({0: [new_index, indexer]},
  3134. copy=copy, fill_value=fill_value,
  3135. allow_dups=False)
  3136. def _reindex_columns(self, new_columns, method, copy, level,
  3137. fill_value=None, limit=None, tolerance=None):
  3138. new_columns, indexer = self.columns.reindex(new_columns, method=method,
  3139. level=level, limit=limit,
  3140. tolerance=tolerance)
  3141. return self._reindex_with_indexers({1: [new_columns, indexer]},
  3142. copy=copy, fill_value=fill_value,
  3143. allow_dups=False)
  3144. def _reindex_multi(self, axes, copy, fill_value):
  3145. """
  3146. We are guaranteed non-Nones in the axes.
  3147. """
  3148. new_index, row_indexer = self.index.reindex(axes['index'])
  3149. new_columns, col_indexer = self.columns.reindex(axes['columns'])
  3150. if row_indexer is not None and col_indexer is not None:
  3151. indexer = row_indexer, col_indexer
  3152. new_values = algorithms.take_2d_multi(self.values, indexer,
  3153. fill_value=fill_value)
  3154. return self._constructor(new_values, index=new_index,
  3155. columns=new_columns)
  3156. else:
  3157. return self._reindex_with_indexers({0: [new_index, row_indexer],
  3158. 1: [new_columns, col_indexer]},
  3159. copy=copy,
  3160. fill_value=fill_value)
  3161. @Appender(_shared_docs['align'] % _shared_doc_kwargs)
  3162. def align(self, other, join='outer', axis=None, level=None, copy=True,
  3163. fill_value=None, method=None, limit=None, fill_axis=0,
  3164. broadcast_axis=None):
  3165. return super(DataFrame, self).align(other, join=join, axis=axis,
  3166. level=level, copy=copy,
  3167. fill_value=fill_value,
  3168. method=method, limit=limit,
  3169. fill_axis=fill_axis,
  3170. broadcast_axis=broadcast_axis)
  3171. @Substitution(**_shared_doc_kwargs)
  3172. @Appender(NDFrame.reindex.__doc__)
  3173. @rewrite_axis_style_signature('labels', [('method', None),
  3174. ('copy', True),
  3175. ('level', None),
  3176. ('fill_value', np.nan),
  3177. ('limit', None),
  3178. ('tolerance', None)])
  3179. def reindex(self, *args, **kwargs):
  3180. axes = validate_axis_style_args(self, args, kwargs, 'labels',
  3181. 'reindex')
  3182. kwargs.update(axes)
  3183. # Pop these, since the values are in `kwargs` under different names
  3184. kwargs.pop('axis', None)
  3185. kwargs.pop('labels', None)
  3186. return super(DataFrame, self).reindex(**kwargs)
  3187. @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
  3188. def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
  3189. limit=None, fill_value=np.nan):
  3190. return super(DataFrame,
  3191. self).reindex_axis(labels=labels, axis=axis,
  3192. method=method, level=level, copy=copy,
  3193. limit=limit, fill_value=fill_value)
  3194. def drop(self, labels=None, axis=0, index=None, columns=None,
  3195. level=None, inplace=False, errors='raise'):
  3196. """
  3197. Drop specified labels from rows or columns.
  3198. Remove rows or columns by specifying label names and corresponding
  3199. axis, or by specifying directly index or column names. When using a
  3200. multi-index, labels on different levels can be removed by specifying
  3201. the level.
  3202. Parameters
  3203. ----------
  3204. labels : single label or list-like
  3205. Index or column labels to drop.
  3206. axis : {0 or 'index', 1 or 'columns'}, default 0
  3207. Whether to drop labels from the index (0 or 'index') or
  3208. columns (1 or 'columns').
  3209. index : single label or list-like
  3210. Alternative to specifying axis (``labels, axis=0``
  3211. is equivalent to ``index=labels``).
  3212. .. versionadded:: 0.21.0
  3213. columns : single label or list-like
  3214. Alternative to specifying axis (``labels, axis=1``
  3215. is equivalent to ``columns=labels``).
  3216. .. versionadded:: 0.21.0
  3217. level : int or level name, optional
  3218. For MultiIndex, level from which the labels will be removed.
  3219. inplace : bool, default False
  3220. If True, do operation inplace and return None.
  3221. errors : {'ignore', 'raise'}, default 'raise'
  3222. If 'ignore', suppress error and only existing labels are
  3223. dropped.
  3224. Returns
  3225. -------
  3226. DataFrame
  3227. DataFrame without the removed index or column labels.
  3228. Raises
  3229. ------
  3230. KeyError
  3231. If any of the labels is not found in the selected axis.
  3232. See Also
  3233. --------
  3234. DataFrame.loc : Label-location based indexer for selection by label.
  3235. DataFrame.dropna : Return DataFrame with labels on given axis omitted
  3236. where (all or any) data are missing.
  3237. DataFrame.drop_duplicates : Return DataFrame with duplicate rows
  3238. removed, optionally only considering certain columns.
  3239. Series.drop : Return Series with specified index labels removed.
  3240. Examples
  3241. --------
  3242. >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
  3243. ... columns=['A', 'B', 'C', 'D'])
  3244. >>> df
  3245. A B C D
  3246. 0 0 1 2 3
  3247. 1 4 5 6 7
  3248. 2 8 9 10 11
  3249. Drop columns
  3250. >>> df.drop(['B', 'C'], axis=1)
  3251. A D
  3252. 0 0 3
  3253. 1 4 7
  3254. 2 8 11
  3255. >>> df.drop(columns=['B', 'C'])
  3256. A D
  3257. 0 0 3
  3258. 1 4 7
  3259. 2 8 11
  3260. Drop a row by index
  3261. >>> df.drop([0, 1])
  3262. A B C D
  3263. 2 8 9 10 11
  3264. Drop columns and/or rows of MultiIndex DataFrame
  3265. >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
  3266. ... ['speed', 'weight', 'length']],
  3267. ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
  3268. ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
  3269. >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
  3270. ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
  3271. ... [250, 150], [1.5, 0.8], [320, 250],
  3272. ... [1, 0.8], [0.3, 0.2]])
  3273. >>> df
  3274. big small
  3275. lama speed 45.0 30.0
  3276. weight 200.0 100.0
  3277. length 1.5 1.0
  3278. cow speed 30.0 20.0
  3279. weight 250.0 150.0
  3280. length 1.5 0.8
  3281. falcon speed 320.0 250.0
  3282. weight 1.0 0.8
  3283. length 0.3 0.2
  3284. >>> df.drop(index='cow', columns='small')
  3285. big
  3286. lama speed 45.0
  3287. weight 200.0
  3288. length 1.5
  3289. falcon speed 320.0
  3290. weight 1.0
  3291. length 0.3
  3292. >>> df.drop(index='length', level=1)
  3293. big small
  3294. lama speed 45.0 30.0
  3295. weight 200.0 100.0
  3296. cow speed 30.0 20.0
  3297. weight 250.0 150.0
  3298. falcon speed 320.0 250.0
  3299. weight 1.0 0.8
  3300. """
  3301. return super(DataFrame, self).drop(labels=labels, axis=axis,
  3302. index=index, columns=columns,
  3303. level=level, inplace=inplace,
  3304. errors=errors)
  3305. @rewrite_axis_style_signature('mapper', [('copy', True),
  3306. ('inplace', False),
  3307. ('level', None),
  3308. ('errors', 'ignore')])
  3309. def rename(self, *args, **kwargs):
  3310. """
  3311. Alter axes labels.
  3312. Function / dict values must be unique (1-to-1). Labels not contained in
  3313. a dict / Series will be left as-is. Extra labels listed don't throw an
  3314. error.
  3315. See the :ref:`user guide <basics.rename>` for more.
  3316. Parameters
  3317. ----------
  3318. mapper : dict-like or function
  3319. Dict-like or functions transformations to apply to
  3320. that axis' values. Use either ``mapper`` and ``axis`` to
  3321. specify the axis to target with ``mapper``, or ``index`` and
  3322. ``columns``.
  3323. index : dict-like or function
  3324. Alternative to specifying axis (``mapper, axis=0``
  3325. is equivalent to ``index=mapper``).
  3326. columns : dict-like or function
  3327. Alternative to specifying axis (``mapper, axis=1``
  3328. is equivalent to ``columns=mapper``).
  3329. axis : int or str
  3330. Axis to target with ``mapper``. Can be either the axis name
  3331. ('index', 'columns') or number (0, 1). The default is 'index'.
  3332. copy : bool, default True
  3333. Also copy underlying data.
  3334. inplace : bool, default False
  3335. Whether to return a new DataFrame. If True then value of copy is
  3336. ignored.
  3337. level : int or level name, default None
  3338. In case of a MultiIndex, only rename labels in the specified
  3339. level.
  3340. errors : {'ignore', 'raise'}, default 'ignore'
  3341. If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
  3342. or `columns` contains labels that are not present in the Index
  3343. being transformed.
  3344. If 'ignore', existing keys will be renamed and extra keys will be
  3345. ignored.
  3346. Returns
  3347. -------
  3348. DataFrame
  3349. DataFrame with the renamed axis labels.
  3350. Raises
  3351. ------
  3352. KeyError
  3353. If any of the labels is not found in the selected axis and
  3354. "errors='raise'".
  3355. See Also
  3356. --------
  3357. DataFrame.rename_axis : Set the name of the axis.
  3358. Examples
  3359. --------
  3360. ``DataFrame.rename`` supports two calling conventions
  3361. * ``(index=index_mapper, columns=columns_mapper, ...)``
  3362. * ``(mapper, axis={'index', 'columns'}, ...)``
  3363. We *highly* recommend using keyword arguments to clarify your
  3364. intent.
  3365. >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  3366. >>> df.rename(index=str, columns={"A": "a", "B": "c"})
  3367. a c
  3368. 0 1 4
  3369. 1 2 5
  3370. 2 3 6
  3371. >>> df.rename(index=str, columns={"A": "a", "C": "c"})
  3372. a B
  3373. 0 1 4
  3374. 1 2 5
  3375. 2 3 6
  3376. >>> df.rename(index=str, columns={"A": "a", "C": "c"}, errors="raise")
  3377. Traceback (most recent call last):
  3378. KeyError: ['C'] not found in axis
  3379. Using axis-style parameters
  3380. >>> df.rename(str.lower, axis='columns')
  3381. a b
  3382. 0 1 4
  3383. 1 2 5
  3384. 2 3 6
  3385. >>> df.rename({1: 2, 2: 4}, axis='index')
  3386. A B
  3387. 0 1 4
  3388. 2 2 5
  3389. 4 3 6
  3390. """
  3391. axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename')
  3392. kwargs.update(axes)
  3393. # Pop these, since the values are in `kwargs` under different names
  3394. kwargs.pop('axis', None)
  3395. kwargs.pop('mapper', None)
  3396. return super(DataFrame, self).rename(**kwargs)
  3397. @Substitution(**_shared_doc_kwargs)
  3398. @Appender(NDFrame.fillna.__doc__)
  3399. def fillna(self, value=None, method=None, axis=None, inplace=False,
  3400. limit=None, downcast=None, **kwargs):
  3401. return super(DataFrame,
  3402. self).fillna(value=value, method=method, axis=axis,
  3403. inplace=inplace, limit=limit,
  3404. downcast=downcast, **kwargs)
  3405. @Appender(_shared_docs['replace'] % _shared_doc_kwargs)
  3406. def replace(self, to_replace=None, value=None, inplace=False, limit=None,
  3407. regex=False, method='pad'):
  3408. return super(DataFrame, self).replace(to_replace=to_replace,
  3409. value=value, inplace=inplace,
  3410. limit=limit, regex=regex,
  3411. method=method)
  3412. @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
  3413. def shift(self, periods=1, freq=None, axis=0, fill_value=None):
  3414. return super(DataFrame, self).shift(periods=periods, freq=freq,
  3415. axis=axis, fill_value=fill_value)
  3416. def set_index(self, keys, drop=True, append=False, inplace=False,
  3417. verify_integrity=False):
  3418. """
  3419. Set the DataFrame index using existing columns.
  3420. Set the DataFrame index (row labels) using one or more existing
  3421. columns or arrays (of the correct length). The index can replace the
  3422. existing index or expand on it.
  3423. Parameters
  3424. ----------
  3425. keys : label or array-like or list of labels/arrays
  3426. This parameter can be either a single column key, a single array of
  3427. the same length as the calling DataFrame, or a list containing an
  3428. arbitrary combination of column keys and arrays. Here, "array"
  3429. encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
  3430. instances of :class:`abc.Iterator`.
  3431. drop : bool, default True
  3432. Delete columns to be used as the new index.
  3433. append : bool, default False
  3434. Whether to append columns to existing index.
  3435. inplace : bool, default False
  3436. Modify the DataFrame in place (do not create a new object).
  3437. verify_integrity : bool, default False
  3438. Check the new index for duplicates. Otherwise defer the check until
  3439. necessary. Setting to False will improve the performance of this
  3440. method.
  3441. Returns
  3442. -------
  3443. DataFrame
  3444. Changed row labels.
  3445. See Also
  3446. --------
  3447. DataFrame.reset_index : Opposite of set_index.
  3448. DataFrame.reindex : Change to new indices or expand indices.
  3449. DataFrame.reindex_like : Change to same indices as other DataFrame.
  3450. Examples
  3451. --------
  3452. >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
  3453. ... 'year': [2012, 2014, 2013, 2014],
  3454. ... 'sale': [55, 40, 84, 31]})
  3455. >>> df
  3456. month year sale
  3457. 0 1 2012 55
  3458. 1 4 2014 40
  3459. 2 7 2013 84
  3460. 3 10 2014 31
  3461. Set the index to become the 'month' column:
  3462. >>> df.set_index('month')
  3463. year sale
  3464. month
  3465. 1 2012 55
  3466. 4 2014 40
  3467. 7 2013 84
  3468. 10 2014 31
  3469. Create a MultiIndex using columns 'year' and 'month':
  3470. >>> df.set_index(['year', 'month'])
  3471. sale
  3472. year month
  3473. 2012 1 55
  3474. 2014 4 40
  3475. 2013 7 84
  3476. 2014 10 31
  3477. Create a MultiIndex using an Index and a column:
  3478. >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
  3479. month sale
  3480. year
  3481. 1 2012 1 55
  3482. 2 2014 4 40
  3483. 3 2013 7 84
  3484. 4 2014 10 31
  3485. Create a MultiIndex using two Series:
  3486. >>> s = pd.Series([1, 2, 3, 4])
  3487. >>> df.set_index([s, s**2])
  3488. month year sale
  3489. 1 1 1 2012 55
  3490. 2 4 4 2014 40
  3491. 3 9 7 2013 84
  3492. 4 16 10 2014 31
  3493. """
  3494. inplace = validate_bool_kwarg(inplace, 'inplace')
  3495. if not isinstance(keys, list):
  3496. keys = [keys]
  3497. err_msg = ('The parameter "keys" may be a column key, one-dimensional '
  3498. 'array, or a list containing only valid column keys and '
  3499. 'one-dimensional arrays.')
  3500. missing = []
  3501. for col in keys:
  3502. if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray,
  3503. list, Iterator)):
  3504. # arrays are fine as long as they are one-dimensional
  3505. # iterators get converted to list below
  3506. if getattr(col, 'ndim', 1) != 1:
  3507. raise ValueError(err_msg)
  3508. else:
  3509. # everything else gets tried as a key; see GH 24969
  3510. try:
  3511. found = col in self.columns
  3512. except TypeError:
  3513. raise TypeError(err_msg + ' Received column of '
  3514. 'type {}'.format(type(col)))
  3515. else:
  3516. if not found:
  3517. missing.append(col)
  3518. if missing:
  3519. raise KeyError('None of {} are in the columns'.format(missing))
  3520. if inplace:
  3521. frame = self
  3522. else:
  3523. frame = self.copy()
  3524. arrays = []
  3525. names = []
  3526. if append:
  3527. names = [x for x in self.index.names]
  3528. if isinstance(self.index, ABCMultiIndex):
  3529. for i in range(self.index.nlevels):
  3530. arrays.append(self.index._get_level_values(i))
  3531. else:
  3532. arrays.append(self.index)
  3533. to_remove = []
  3534. for col in keys:
  3535. if isinstance(col, ABCMultiIndex):
  3536. for n in range(col.nlevels):
  3537. arrays.append(col._get_level_values(n))
  3538. names.extend(col.names)
  3539. elif isinstance(col, (ABCIndexClass, ABCSeries)):
  3540. # if Index then not MultiIndex (treated above)
  3541. arrays.append(col)
  3542. names.append(col.name)
  3543. elif isinstance(col, (list, np.ndarray)):
  3544. arrays.append(col)
  3545. names.append(None)
  3546. elif isinstance(col, Iterator):
  3547. arrays.append(list(col))
  3548. names.append(None)
  3549. # from here, col can only be a column label
  3550. else:
  3551. arrays.append(frame[col]._values)
  3552. names.append(col)
  3553. if drop:
  3554. to_remove.append(col)
  3555. if len(arrays[-1]) != len(self):
  3556. # check newest element against length of calling frame, since
  3557. # ensure_index_from_sequences would not raise for append=False.
  3558. raise ValueError('Length mismatch: Expected {len_self} rows, '
  3559. 'received array of length {len_col}'.format(
  3560. len_self=len(self),
  3561. len_col=len(arrays[-1])
  3562. ))
  3563. index = ensure_index_from_sequences(arrays, names)
  3564. if verify_integrity and not index.is_unique:
  3565. duplicates = index[index.duplicated()].unique()
  3566. raise ValueError('Index has duplicate keys: {dup}'.format(
  3567. dup=duplicates))
  3568. # use set to handle duplicate column names gracefully in case of drop
  3569. for c in set(to_remove):
  3570. del frame[c]
  3571. # clear up memory usage
  3572. index._cleanup()
  3573. frame.index = index
  3574. if not inplace:
  3575. return frame
  3576. def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
  3577. col_fill=''):
  3578. """
  3579. Reset the index, or a level of it.
  3580. Reset the index of the DataFrame, and use the default one instead.
  3581. If the DataFrame has a MultiIndex, this method can remove one or more
  3582. levels.
  3583. Parameters
  3584. ----------
  3585. level : int, str, tuple, or list, default None
  3586. Only remove the given levels from the index. Removes all levels by
  3587. default.
  3588. drop : bool, default False
  3589. Do not try to insert index into dataframe columns. This resets
  3590. the index to the default integer index.
  3591. inplace : bool, default False
  3592. Modify the DataFrame in place (do not create a new object).
  3593. col_level : int or str, default 0
  3594. If the columns have multiple levels, determines which level the
  3595. labels are inserted into. By default it is inserted into the first
  3596. level.
  3597. col_fill : object, default ''
  3598. If the columns have multiple levels, determines how the other
  3599. levels are named. If None then the index name is repeated.
  3600. Returns
  3601. -------
  3602. DataFrame
  3603. DataFrame with the new index.
  3604. See Also
  3605. --------
  3606. DataFrame.set_index : Opposite of reset_index.
  3607. DataFrame.reindex : Change to new indices or expand indices.
  3608. DataFrame.reindex_like : Change to same indices as other DataFrame.
  3609. Examples
  3610. --------
  3611. >>> df = pd.DataFrame([('bird', 389.0),
  3612. ... ('bird', 24.0),
  3613. ... ('mammal', 80.5),
  3614. ... ('mammal', np.nan)],
  3615. ... index=['falcon', 'parrot', 'lion', 'monkey'],
  3616. ... columns=('class', 'max_speed'))
  3617. >>> df
  3618. class max_speed
  3619. falcon bird 389.0
  3620. parrot bird 24.0
  3621. lion mammal 80.5
  3622. monkey mammal NaN
  3623. When we reset the index, the old index is added as a column, and a
  3624. new sequential index is used:
  3625. >>> df.reset_index()
  3626. index class max_speed
  3627. 0 falcon bird 389.0
  3628. 1 parrot bird 24.0
  3629. 2 lion mammal 80.5
  3630. 3 monkey mammal NaN
  3631. We can use the `drop` parameter to avoid the old index being added as
  3632. a column:
  3633. >>> df.reset_index(drop=True)
  3634. class max_speed
  3635. 0 bird 389.0
  3636. 1 bird 24.0
  3637. 2 mammal 80.5
  3638. 3 mammal NaN
  3639. You can also use `reset_index` with `MultiIndex`.
  3640. >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
  3641. ... ('bird', 'parrot'),
  3642. ... ('mammal', 'lion'),
  3643. ... ('mammal', 'monkey')],
  3644. ... names=['class', 'name'])
  3645. >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
  3646. ... ('species', 'type')])
  3647. >>> df = pd.DataFrame([(389.0, 'fly'),
  3648. ... ( 24.0, 'fly'),
  3649. ... ( 80.5, 'run'),
  3650. ... (np.nan, 'jump')],
  3651. ... index=index,
  3652. ... columns=columns)
  3653. >>> df
  3654. speed species
  3655. max type
  3656. class name
  3657. bird falcon 389.0 fly
  3658. parrot 24.0 fly
  3659. mammal lion 80.5 run
  3660. monkey NaN jump
  3661. If the index has multiple levels, we can reset a subset of them:
  3662. >>> df.reset_index(level='class')
  3663. class speed species
  3664. max type
  3665. name
  3666. falcon bird 389.0 fly
  3667. parrot bird 24.0 fly
  3668. lion mammal 80.5 run
  3669. monkey mammal NaN jump
  3670. If we are not dropping the index, by default, it is placed in the top
  3671. level. We can place it in another level:
  3672. >>> df.reset_index(level='class', col_level=1)
  3673. speed species
  3674. class max type
  3675. name
  3676. falcon bird 389.0 fly
  3677. parrot bird 24.0 fly
  3678. lion mammal 80.5 run
  3679. monkey mammal NaN jump
  3680. When the index is inserted under another level, we can specify under
  3681. which one with the parameter `col_fill`:
  3682. >>> df.reset_index(level='class', col_level=1, col_fill='species')
  3683. species speed species
  3684. class max type
  3685. name
  3686. falcon bird 389.0 fly
  3687. parrot bird 24.0 fly
  3688. lion mammal 80.5 run
  3689. monkey mammal NaN jump
  3690. If we specify a nonexistent level for `col_fill`, it is created:
  3691. >>> df.reset_index(level='class', col_level=1, col_fill='genus')
  3692. genus speed species
  3693. class max type
  3694. name
  3695. falcon bird 389.0 fly
  3696. parrot bird 24.0 fly
  3697. lion mammal 80.5 run
  3698. monkey mammal NaN jump
  3699. """
  3700. inplace = validate_bool_kwarg(inplace, 'inplace')
  3701. if inplace:
  3702. new_obj = self
  3703. else:
  3704. new_obj = self.copy()
  3705. def _maybe_casted_values(index, labels=None):
  3706. values = index._values
  3707. if not isinstance(index, (PeriodIndex, DatetimeIndex)):
  3708. if values.dtype == np.object_:
  3709. values = lib.maybe_convert_objects(values)
  3710. # if we have the labels, extract the values with a mask
  3711. if labels is not None:
  3712. mask = labels == -1
  3713. # we can have situations where the whole mask is -1,
  3714. # meaning there is nothing found in labels, so make all nan's
  3715. if mask.all():
  3716. values = np.empty(len(mask))
  3717. values.fill(np.nan)
  3718. else:
  3719. values = values.take(labels)
  3720. # TODO(https://github.com/pandas-dev/pandas/issues/24206)
  3721. # Push this into maybe_upcast_putmask?
  3722. # We can't pass EAs there right now. Looks a bit
  3723. # complicated.
  3724. # So we unbox the ndarray_values, op, re-box.
  3725. values_type = type(values)
  3726. values_dtype = values.dtype
  3727. if issubclass(values_type, DatetimeLikeArray):
  3728. values = values._data
  3729. if mask.any():
  3730. values, changed = maybe_upcast_putmask(
  3731. values, mask, np.nan)
  3732. if issubclass(values_type, DatetimeLikeArray):
  3733. values = values_type(values, dtype=values_dtype)
  3734. return values
  3735. new_index = ibase.default_index(len(new_obj))
  3736. if level is not None:
  3737. if not isinstance(level, (tuple, list)):
  3738. level = [level]
  3739. level = [self.index._get_level_number(lev) for lev in level]
  3740. if len(level) < self.index.nlevels:
  3741. new_index = self.index.droplevel(level)
  3742. if not drop:
  3743. if isinstance(self.index, MultiIndex):
  3744. names = [n if n is not None else ('level_%d' % i)
  3745. for (i, n) in enumerate(self.index.names)]
  3746. to_insert = lzip(self.index.levels, self.index.codes)
  3747. else:
  3748. default = 'index' if 'index' not in self else 'level_0'
  3749. names = ([default] if self.index.name is None
  3750. else [self.index.name])
  3751. to_insert = ((self.index, None),)
  3752. multi_col = isinstance(self.columns, MultiIndex)
  3753. for i, (lev, lab) in reversed(list(enumerate(to_insert))):
  3754. if not (level is None or i in level):
  3755. continue
  3756. name = names[i]
  3757. if multi_col:
  3758. col_name = (list(name) if isinstance(name, tuple)
  3759. else [name])
  3760. if col_fill is None:
  3761. if len(col_name) not in (1, self.columns.nlevels):
  3762. raise ValueError("col_fill=None is incompatible "
  3763. "with incomplete column name "
  3764. "{}".format(name))
  3765. col_fill = col_name[0]
  3766. lev_num = self.columns._get_level_number(col_level)
  3767. name_lst = [col_fill] * lev_num + col_name
  3768. missing = self.columns.nlevels - len(name_lst)
  3769. name_lst += [col_fill] * missing
  3770. name = tuple(name_lst)
  3771. # to ndarray and maybe infer different dtype
  3772. level_values = _maybe_casted_values(lev, lab)
  3773. new_obj.insert(0, name, level_values)
  3774. new_obj.index = new_index
  3775. if not inplace:
  3776. return new_obj
  3777. # ----------------------------------------------------------------------
  3778. # Reindex-based selection methods
  3779. @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
  3780. def isna(self):
  3781. return super(DataFrame, self).isna()
  3782. @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
  3783. def isnull(self):
  3784. return super(DataFrame, self).isnull()
  3785. @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
  3786. def notna(self):
  3787. return super(DataFrame, self).notna()
  3788. @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
  3789. def notnull(self):
  3790. return super(DataFrame, self).notnull()
  3791. def dropna(self, axis=0, how='any', thresh=None, subset=None,
  3792. inplace=False):
  3793. """
  3794. Remove missing values.
  3795. See the :ref:`User Guide <missing_data>` for more on which values are
  3796. considered missing, and how to work with missing data.
  3797. Parameters
  3798. ----------
  3799. axis : {0 or 'index', 1 or 'columns'}, default 0
  3800. Determine if rows or columns which contain missing values are
  3801. removed.
  3802. * 0, or 'index' : Drop rows which contain missing values.
  3803. * 1, or 'columns' : Drop columns which contain missing value.
  3804. .. deprecated:: 0.23.0
  3805. Pass tuple or list to drop on multiple axes.
  3806. Only a single axis is allowed.
  3807. how : {'any', 'all'}, default 'any'
  3808. Determine if row or column is removed from DataFrame, when we have
  3809. at least one NA or all NA.
  3810. * 'any' : If any NA values are present, drop that row or column.
  3811. * 'all' : If all values are NA, drop that row or column.
  3812. thresh : int, optional
  3813. Require that many non-NA values.
  3814. subset : array-like, optional
  3815. Labels along other axis to consider, e.g. if you are dropping rows
  3816. these would be a list of columns to include.
  3817. inplace : bool, default False
  3818. If True, do operation inplace and return None.
  3819. Returns
  3820. -------
  3821. DataFrame
  3822. DataFrame with NA entries dropped from it.
  3823. See Also
  3824. --------
  3825. DataFrame.isna: Indicate missing values.
  3826. DataFrame.notna : Indicate existing (non-missing) values.
  3827. DataFrame.fillna : Replace missing values.
  3828. Series.dropna : Drop missing values.
  3829. Index.dropna : Drop missing indices.
  3830. Examples
  3831. --------
  3832. >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
  3833. ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
  3834. ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
  3835. ... pd.NaT]})
  3836. >>> df
  3837. name toy born
  3838. 0 Alfred NaN NaT
  3839. 1 Batman Batmobile 1940-04-25
  3840. 2 Catwoman Bullwhip NaT
  3841. Drop the rows where at least one element is missing.
  3842. >>> df.dropna()
  3843. name toy born
  3844. 1 Batman Batmobile 1940-04-25
  3845. Drop the columns where at least one element is missing.
  3846. >>> df.dropna(axis='columns')
  3847. name
  3848. 0 Alfred
  3849. 1 Batman
  3850. 2 Catwoman
  3851. Drop the rows where all elements are missing.
  3852. >>> df.dropna(how='all')
  3853. name toy born
  3854. 0 Alfred NaN NaT
  3855. 1 Batman Batmobile 1940-04-25
  3856. 2 Catwoman Bullwhip NaT
  3857. Keep only the rows with at least 2 non-NA values.
  3858. >>> df.dropna(thresh=2)
  3859. name toy born
  3860. 1 Batman Batmobile 1940-04-25
  3861. 2 Catwoman Bullwhip NaT
  3862. Define in which columns to look for missing values.
  3863. >>> df.dropna(subset=['name', 'born'])
  3864. name toy born
  3865. 1 Batman Batmobile 1940-04-25
  3866. Keep the DataFrame with valid entries in the same variable.
  3867. >>> df.dropna(inplace=True)
  3868. >>> df
  3869. name toy born
  3870. 1 Batman Batmobile 1940-04-25
  3871. """
  3872. inplace = validate_bool_kwarg(inplace, 'inplace')
  3873. if isinstance(axis, (tuple, list)):
  3874. # GH20987
  3875. msg = ("supplying multiple axes to axis is deprecated and "
  3876. "will be removed in a future version.")
  3877. warnings.warn(msg, FutureWarning, stacklevel=2)
  3878. result = self
  3879. for ax in axis:
  3880. result = result.dropna(how=how, thresh=thresh, subset=subset,
  3881. axis=ax)
  3882. else:
  3883. axis = self._get_axis_number(axis)
  3884. agg_axis = 1 - axis
  3885. agg_obj = self
  3886. if subset is not None:
  3887. ax = self._get_axis(agg_axis)
  3888. indices = ax.get_indexer_for(subset)
  3889. check = indices == -1
  3890. if check.any():
  3891. raise KeyError(list(np.compress(check, subset)))
  3892. agg_obj = self.take(indices, axis=agg_axis)
  3893. count = agg_obj.count(axis=agg_axis)
  3894. if thresh is not None:
  3895. mask = count >= thresh
  3896. elif how == 'any':
  3897. mask = count == len(agg_obj._get_axis(agg_axis))
  3898. elif how == 'all':
  3899. mask = count > 0
  3900. else:
  3901. if how is not None:
  3902. raise ValueError('invalid how option: {h}'.format(h=how))
  3903. else:
  3904. raise TypeError('must specify how or thresh')
  3905. result = self.loc(axis=axis)[mask]
  3906. if inplace:
  3907. self._update_inplace(result)
  3908. else:
  3909. return result
  3910. def drop_duplicates(self, subset=None, keep='first', inplace=False):
  3911. """
  3912. Return DataFrame with duplicate rows removed, optionally only
3913. considering certain columns. Indexes, including time indexes,
3914. are ignored.
  3915. Parameters
  3916. ----------
  3917. subset : column label or sequence of labels, optional
  3918. Only consider certain columns for identifying duplicates, by
  3919. default use all of the columns
  3920. keep : {'first', 'last', False}, default 'first'
  3921. - ``first`` : Drop duplicates except for the first occurrence.
  3922. - ``last`` : Drop duplicates except for the last occurrence.
  3923. - False : Drop all duplicates.
  3924. inplace : boolean, default False
  3925. Whether to drop duplicates in place or to return a copy
  3926. Returns
  3927. -------
  3928. DataFrame
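Examples
--------
A minimal illustration with toy data (example added for clarity, not
part of the original docstring):
>>> df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})
>>> df.drop_duplicates()
   a  b
0  1  x
2  2  y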
  3929. """
  3930. if self.empty:
  3931. return self.copy()
  3932. inplace = validate_bool_kwarg(inplace, 'inplace')
  3933. duplicated = self.duplicated(subset, keep=keep)
  3934. if inplace:
3935. inds, = (~duplicated)._ndarray_values.nonzero()
  3936. new_data = self._data.take(inds)
  3937. self._update_inplace(new_data)
  3938. else:
3939. return self[~duplicated]
  3940. def duplicated(self, subset=None, keep='first'):
  3941. """
  3942. Return boolean Series denoting duplicate rows, optionally only
  3943. considering certain columns.
  3944. Parameters
  3945. ----------
  3946. subset : column label or sequence of labels, optional
  3947. Only consider certain columns for identifying duplicates, by
  3948. default use all of the columns
  3949. keep : {'first', 'last', False}, default 'first'
  3950. - ``first`` : Mark duplicates as ``True`` except for the
  3951. first occurrence.
  3952. - ``last`` : Mark duplicates as ``True`` except for the
  3953. last occurrence.
  3954. - False : Mark all duplicates as ``True``.
  3955. Returns
  3956. -------
  3957. Series
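Examples
--------
A minimal illustration with toy data (example added for clarity, not
part of the original docstring):
>>> df = pd.DataFrame({'a': [1, 1, 2]})
>>> df.duplicated()
0    False
1     True
2    False
dtype: bool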
  3958. """
  3959. from pandas.core.sorting import get_group_index
  3960. from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
  3961. if self.empty:
  3962. return Series(dtype=bool)
  3963. def f(vals):
  3964. labels, shape = algorithms.factorize(
  3965. vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
  3966. return labels.astype('i8', copy=False), len(shape)
  3967. if subset is None:
  3968. subset = self.columns
  3969. elif (not np.iterable(subset) or
  3970. isinstance(subset, compat.string_types) or
  3971. isinstance(subset, tuple) and subset in self.columns):
  3972. subset = subset,
  3973. # Verify all columns in subset exist in the queried dataframe
  3974. # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
  3975. # key that doesn't exist.
  3976. diff = Index(subset).difference(self.columns)
  3977. if not diff.empty:
  3978. raise KeyError(diff)
  3979. vals = (col.values for name, col in self.iteritems()
  3980. if name in subset)
  3981. labels, shape = map(list, zip(*map(f, vals)))
  3982. ids = get_group_index(labels, shape, sort=False, xnull=False)
  3983. return Series(duplicated_int64(ids, keep), index=self.index)
  3984. # ----------------------------------------------------------------------
  3985. # Sorting
  3986. @Substitution(**_shared_doc_kwargs)
  3987. @Appender(NDFrame.sort_values.__doc__)
  3988. def sort_values(self, by, axis=0, ascending=True, inplace=False,
  3989. kind='quicksort', na_position='last'):
  3990. inplace = validate_bool_kwarg(inplace, 'inplace')
  3991. axis = self._get_axis_number(axis)
  3992. if not isinstance(by, list):
  3993. by = [by]
  3994. if is_sequence(ascending) and len(by) != len(ascending):
  3995. raise ValueError('Length of ascending (%d) != length of by (%d)' %
  3996. (len(ascending), len(by)))
  3997. if len(by) > 1:
  3998. from pandas.core.sorting import lexsort_indexer
  3999. keys = [self._get_label_or_level_values(x, axis=axis)
  4000. for x in by]
  4001. indexer = lexsort_indexer(keys, orders=ascending,
  4002. na_position=na_position)
  4003. indexer = ensure_platform_int(indexer)
  4004. else:
  4005. from pandas.core.sorting import nargsort
  4006. by = by[0]
  4007. k = self._get_label_or_level_values(by, axis=axis)
  4008. if isinstance(ascending, (tuple, list)):
  4009. ascending = ascending[0]
  4010. indexer = nargsort(k, kind=kind, ascending=ascending,
  4011. na_position=na_position)
  4012. new_data = self._data.take(indexer,
  4013. axis=self._get_block_manager_axis(axis),
  4014. verify=False)
  4015. if inplace:
  4016. return self._update_inplace(new_data)
  4017. else:
  4018. return self._constructor(new_data).__finalize__(self)
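# Hedged sketch (toy data, not original source) of the multi-key path
# above, which lexsorts on all of `by` at once:
# >>> df = pd.DataFrame({'a': [2, 1, 1], 'b': [0, 2, 1]})
# >>> df.sort_values(['a', 'b'])
#    a  b
# 2  1  1
# 1  1  2
# 0  2  0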
  4019. @Substitution(**_shared_doc_kwargs)
  4020. @Appender(NDFrame.sort_index.__doc__)
  4021. def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
  4022. kind='quicksort', na_position='last', sort_remaining=True,
  4023. by=None):
  4024. # TODO: this can be combined with Series.sort_index impl as
  4025. # almost identical
  4026. inplace = validate_bool_kwarg(inplace, 'inplace')
4027. # GH 10726
  4028. if by is not None:
  4029. warnings.warn("by argument to sort_index is deprecated, "
  4030. "please use .sort_values(by=...)",
  4031. FutureWarning, stacklevel=2)
  4032. if level is not None:
  4033. raise ValueError("unable to simultaneously sort by and level")
  4034. return self.sort_values(by, axis=axis, ascending=ascending,
  4035. inplace=inplace)
  4036. axis = self._get_axis_number(axis)
  4037. labels = self._get_axis(axis)
  4038. # make sure that the axis is lexsorted to start
  4039. # if not we need to reconstruct to get the correct indexer
  4040. labels = labels._sort_levels_monotonic()
  4041. if level is not None:
  4042. new_axis, indexer = labels.sortlevel(level, ascending=ascending,
  4043. sort_remaining=sort_remaining)
  4044. elif isinstance(labels, MultiIndex):
  4045. from pandas.core.sorting import lexsort_indexer
  4046. indexer = lexsort_indexer(labels._get_codes_for_sorting(),
  4047. orders=ascending,
  4048. na_position=na_position)
  4049. else:
  4050. from pandas.core.sorting import nargsort
4051. # Check monotonicity before sorting the index (GH 11080)
  4053. if ((ascending and labels.is_monotonic_increasing) or
  4054. (not ascending and labels.is_monotonic_decreasing)):
  4055. if inplace:
  4056. return
  4057. else:
  4058. return self.copy()
  4059. indexer = nargsort(labels, kind=kind, ascending=ascending,
  4060. na_position=na_position)
  4061. baxis = self._get_block_manager_axis(axis)
  4062. new_data = self._data.take(indexer,
  4063. axis=baxis,
  4064. verify=False)
  4065. # reconstruct axis if needed
  4066. new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
  4067. if inplace:
  4068. return self._update_inplace(new_data)
  4069. else:
  4070. return self._constructor(new_data).__finalize__(self)
  4071. def nlargest(self, n, columns, keep='first'):
  4072. """
  4073. Return the first `n` rows ordered by `columns` in descending order.
  4074. Return the first `n` rows with the largest values in `columns`, in
  4075. descending order. The columns that are not specified are returned as
  4076. well, but not used for ordering.
  4077. This method is equivalent to
  4078. ``df.sort_values(columns, ascending=False).head(n)``, but more
  4079. performant.
  4080. Parameters
  4081. ----------
  4082. n : int
  4083. Number of rows to return.
  4084. columns : label or list of labels
  4085. Column label(s) to order by.
  4086. keep : {'first', 'last', 'all'}, default 'first'
  4087. Where there are duplicate values:
  4088. - `first` : prioritize the first occurrence(s)
  4089. - `last` : prioritize the last occurrence(s)
4090. - ``all`` : do not drop any duplicates, even if it means
4091. selecting more than `n` items.
  4092. .. versionadded:: 0.24.0
  4093. Returns
  4094. -------
  4095. DataFrame
  4096. The first `n` rows ordered by the given columns in descending
  4097. order.
  4098. See Also
  4099. --------
  4100. DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
  4101. ascending order.
  4102. DataFrame.sort_values : Sort DataFrame by the values.
  4103. DataFrame.head : Return the first `n` rows without re-ordering.
  4104. Notes
  4105. -----
  4106. This function cannot be used with all column types. For example, when
  4107. specifying columns with `object` or `category` dtypes, ``TypeError`` is
  4108. raised.
  4109. Examples
  4110. --------
  4111. >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
  4112. ... 434000, 434000, 337000, 11300,
  4113. ... 11300, 11300],
  4114. ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
  4115. ... 17036, 182, 38, 311],
  4116. ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
  4117. ... "IS", "NR", "TV", "AI"]},
  4118. ... index=["Italy", "France", "Malta",
  4119. ... "Maldives", "Brunei", "Iceland",
  4120. ... "Nauru", "Tuvalu", "Anguilla"])
  4121. >>> df
  4122. population GDP alpha-2
  4123. Italy 59000000 1937894 IT
  4124. France 65000000 2583560 FR
  4125. Malta 434000 12011 MT
  4126. Maldives 434000 4520 MV
  4127. Brunei 434000 12128 BN
  4128. Iceland 337000 17036 IS
  4129. Nauru 11300 182 NR
  4130. Tuvalu 11300 38 TV
  4131. Anguilla 11300 311 AI
  4132. In the following example, we will use ``nlargest`` to select the three
  4133. rows having the largest values in column "population".
  4134. >>> df.nlargest(3, 'population')
  4135. population GDP alpha-2
  4136. France 65000000 2583560 FR
  4137. Italy 59000000 1937894 IT
  4138. Malta 434000 12011 MT
  4139. When using ``keep='last'``, ties are resolved in reverse order:
  4140. >>> df.nlargest(3, 'population', keep='last')
  4141. population GDP alpha-2
  4142. France 65000000 2583560 FR
  4143. Italy 59000000 1937894 IT
  4144. Brunei 434000 12128 BN
  4145. When using ``keep='all'``, all duplicate items are maintained:
  4146. >>> df.nlargest(3, 'population', keep='all')
  4147. population GDP alpha-2
  4148. France 65000000 2583560 FR
  4149. Italy 59000000 1937894 IT
  4150. Malta 434000 12011 MT
  4151. Maldives 434000 4520 MV
  4152. Brunei 434000 12128 BN
  4153. To order by the largest values in column "population" and then "GDP",
  4154. we can specify multiple columns like in the next example.
  4155. >>> df.nlargest(3, ['population', 'GDP'])
  4156. population GDP alpha-2
  4157. France 65000000 2583560 FR
  4158. Italy 59000000 1937894 IT
  4159. Brunei 434000 12128 BN
  4160. """
  4161. return algorithms.SelectNFrame(self,
  4162. n=n,
  4163. keep=keep,
  4164. columns=columns).nlargest()
  4165. def nsmallest(self, n, columns, keep='first'):
  4166. """
  4167. Return the first `n` rows ordered by `columns` in ascending order.
  4168. Return the first `n` rows with the smallest values in `columns`, in
  4169. ascending order. The columns that are not specified are returned as
  4170. well, but not used for ordering.
  4171. This method is equivalent to
  4172. ``df.sort_values(columns, ascending=True).head(n)``, but more
  4173. performant.
  4174. Parameters
  4175. ----------
  4176. n : int
  4177. Number of items to retrieve.
  4178. columns : list or str
  4179. Column name or names to order by.
  4180. keep : {'first', 'last', 'all'}, default 'first'
  4181. Where there are duplicate values:
  4182. - ``first`` : take the first occurrence.
  4183. - ``last`` : take the last occurrence.
4184. - ``all`` : do not drop any duplicates, even if it means
4185. selecting more than `n` items.
  4186. .. versionadded:: 0.24.0
  4187. Returns
  4188. -------
  4189. DataFrame
  4190. See Also
  4191. --------
  4192. DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
  4193. descending order.
  4194. DataFrame.sort_values : Sort DataFrame by the values.
  4195. DataFrame.head : Return the first `n` rows without re-ordering.
  4196. Examples
  4197. --------
  4198. >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
  4199. ... 434000, 434000, 337000, 11300,
  4200. ... 11300, 11300],
  4201. ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
  4202. ... 17036, 182, 38, 311],
  4203. ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
  4204. ... "IS", "NR", "TV", "AI"]},
  4205. ... index=["Italy", "France", "Malta",
  4206. ... "Maldives", "Brunei", "Iceland",
  4207. ... "Nauru", "Tuvalu", "Anguilla"])
  4208. >>> df
  4209. population GDP alpha-2
  4210. Italy 59000000 1937894 IT
  4211. France 65000000 2583560 FR
  4212. Malta 434000 12011 MT
  4213. Maldives 434000 4520 MV
  4214. Brunei 434000 12128 BN
  4215. Iceland 337000 17036 IS
  4216. Nauru 11300 182 NR
  4217. Tuvalu 11300 38 TV
  4218. Anguilla 11300 311 AI
  4219. In the following example, we will use ``nsmallest`` to select the
4220. three rows having the smallest values in column "population".
  4221. >>> df.nsmallest(3, 'population')
  4222. population GDP alpha-2
  4223. Nauru 11300 182 NR
  4224. Tuvalu 11300 38 TV
  4225. Anguilla 11300 311 AI
  4226. When using ``keep='last'``, ties are resolved in reverse order:
  4227. >>> df.nsmallest(3, 'population', keep='last')
  4228. population GDP alpha-2
  4229. Anguilla 11300 311 AI
  4230. Tuvalu 11300 38 TV
  4231. Nauru 11300 182 NR
  4232. When using ``keep='all'``, all duplicate items are maintained:
  4233. >>> df.nsmallest(3, 'population', keep='all')
  4234. population GDP alpha-2
  4235. Nauru 11300 182 NR
  4236. Tuvalu 11300 38 TV
  4237. Anguilla 11300 311 AI
4238. To order by the smallest values in column "population" and then
4239. "GDP", we can specify multiple columns like in the next example.
  4240. >>> df.nsmallest(3, ['population', 'GDP'])
  4241. population GDP alpha-2
  4242. Tuvalu 11300 38 TV
  4243. Nauru 11300 182 NR
  4244. Anguilla 11300 311 AI
  4245. """
  4246. return algorithms.SelectNFrame(self,
  4247. n=n,
  4248. keep=keep,
  4249. columns=columns).nsmallest()
  4250. def swaplevel(self, i=-2, j=-1, axis=0):
  4251. """
  4252. Swap levels i and j in a MultiIndex on a particular axis.
  4253. Parameters
  4254. ----------
  4255. i, j : int, string (can be mixed)
Levels of the index to be swapped. Can pass level names as strings.
  4257. Returns
  4258. -------
  4259. DataFrame
  4260. .. versionchanged:: 0.18.1
  4261. The indexes ``i`` and ``j`` are now optional, and default to
  4262. the two innermost levels of the index.
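Examples
--------
A minimal sketch with hypothetical data; only the resulting level
names are shown:

>>> midx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)],
...                                  names=['x', 'y'])
>>> pd.DataFrame({'v': [10, 20]}, index=midx).swaplevel().index.names
FrozenList(['y', 'x'])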
  4263. """
  4264. result = self.copy()
  4265. axis = self._get_axis_number(axis)
  4266. if axis == 0:
  4267. result.index = result.index.swaplevel(i, j)
  4268. else:
  4269. result.columns = result.columns.swaplevel(i, j)
  4270. return result
  4271. def reorder_levels(self, order, axis=0):
  4272. """
  4273. Rearrange index levels using input order. May not drop or
  4274. duplicate levels.
  4275. Parameters
  4276. ----------
  4277. order : list of int or list of str
  4278. List representing new level order. Reference level by number
  4279. (position) or by key (label).
axis : int
Axis where levels are reordered: 0 for the index, 1 for the columns.
  4282. Returns
  4283. -------
  4284. type of caller (new object)
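Examples
--------
A minimal sketch with hypothetical data; only the resulting level
names are shown:

>>> midx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)],
...                                  names=['x', 'y'])
>>> df = pd.DataFrame({'v': [10, 20]}, index=midx)
>>> df.reorder_levels(['y', 'x']).index.names
FrozenList(['y', 'x'])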
  4285. """
  4286. axis = self._get_axis_number(axis)
  4287. if not isinstance(self._get_axis(axis),
  4288. MultiIndex): # pragma: no cover
  4289. raise TypeError('Can only reorder levels on a hierarchical axis.')
  4290. result = self.copy()
  4291. if axis == 0:
  4292. result.index = result.index.reorder_levels(order)
  4293. else:
  4294. result.columns = result.columns.reorder_levels(order)
  4295. return result
  4296. # ----------------------------------------------------------------------
  4297. # Arithmetic / combination related
  4298. def _combine_frame(self, other, func, fill_value=None, level=None):
  4299. this, other = self.align(other, join='outer', level=level, copy=False)
  4300. new_index, new_columns = this.index, this.columns
  4301. def _arith_op(left, right):
  4302. # for the mixed_type case where we iterate over columns,
  4303. # _arith_op(left, right) is equivalent to
  4304. # left._binop(right, func, fill_value=fill_value)
  4305. left, right = ops.fill_binop(left, right, fill_value)
  4306. return func(left, right)
  4307. if ops.should_series_dispatch(this, other, func):
  4308. # iterate over columns
  4309. return ops.dispatch_to_series(this, other, _arith_op)
  4310. else:
  4311. result = _arith_op(this.values, other.values)
  4312. return self._constructor(result,
  4313. index=new_index, columns=new_columns,
  4314. copy=False)
  4315. def _combine_match_index(self, other, func, level=None):
  4316. left, right = self.align(other, join='outer', axis=0, level=level,
  4317. copy=False)
  4318. assert left.index.equals(right.index)
  4319. if left._is_mixed_type or right._is_mixed_type:
  4320. # operate column-wise; avoid costly object-casting in `.values`
  4321. return ops.dispatch_to_series(left, right, func)
  4322. else:
  4323. # fastpath --> operate directly on values
  4324. with np.errstate(all="ignore"):
  4325. new_data = func(left.values.T, right.values).T
  4326. return self._constructor(new_data,
  4327. index=left.index, columns=self.columns,
  4328. copy=False)
  4329. def _combine_match_columns(self, other, func, level=None):
  4330. assert isinstance(other, Series)
  4331. left, right = self.align(other, join='outer', axis=1, level=level,
  4332. copy=False)
  4333. assert left.columns.equals(right.index)
  4334. return ops.dispatch_to_series(left, right, func, axis="columns")
  4335. def _combine_const(self, other, func):
  4336. assert lib.is_scalar(other) or np.ndim(other) == 0
  4337. return ops.dispatch_to_series(self, other, func)
  4338. def combine(self, other, func, fill_value=None, overwrite=True):
  4339. """
  4340. Perform column-wise combine with another DataFrame.
  4341. Combines a DataFrame with `other` DataFrame using `func`
  4342. to element-wise combine columns. The row and column indexes of the
  4343. resulting DataFrame will be the union of the two.
  4344. Parameters
  4345. ----------
  4346. other : DataFrame
  4347. The DataFrame to merge column-wise.
func : function
Function that takes two Series as inputs and returns a Series or a
scalar. Used to merge the two dataframes column by column.
  4351. fill_value : scalar value, default None
  4352. The value to fill NaNs with prior to passing any column to the
  4353. merge func.
  4354. overwrite : bool, default True
  4355. If True, columns in `self` that do not exist in `other` will be
  4356. overwritten with NaNs.
  4357. Returns
  4358. -------
  4359. DataFrame
  4360. Combination of the provided DataFrames.
  4361. See Also
  4362. --------
  4363. DataFrame.combine_first : Combine two DataFrame objects and default to
  4364. non-null values in frame calling the method.
  4365. Examples
  4366. --------
  4367. Combine using a simple function that chooses the smaller column.
  4368. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  4369. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4370. >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
  4371. >>> df1.combine(df2, take_smaller)
  4372. A B
  4373. 0 0 3
  4374. 1 0 3
  4375. Example using a true element-wise combine function.
  4376. >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
  4377. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4378. >>> df1.combine(df2, np.minimum)
  4379. A B
  4380. 0 1 2
  4381. 1 0 3
  4382. Using `fill_value` fills Nones prior to passing the column to the
  4383. merge function.
  4384. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  4385. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4386. >>> df1.combine(df2, take_smaller, fill_value=-5)
  4387. A B
  4388. 0 0 -5.0
  4389. 1 0 4.0
However, if the same element is None in both dataframes, that None
is preserved:
  4392. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  4393. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
  4394. >>> df1.combine(df2, take_smaller, fill_value=-5)
  4395. A B
  4396. 0 0 -5.0
  4397. 1 0 3.0
Example that demonstrates the use of `overwrite` and the behavior when
the axes differ between the dataframes.
  4400. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  4401. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
  4402. >>> df1.combine(df2, take_smaller)
  4403. A B C
  4404. 0 NaN NaN NaN
  4405. 1 NaN 3.0 -10.0
  4406. 2 NaN 3.0 1.0
  4407. >>> df1.combine(df2, take_smaller, overwrite=False)
  4408. A B C
  4409. 0 0.0 NaN NaN
  4410. 1 0.0 3.0 -10.0
  4411. 2 NaN 3.0 1.0
Demonstrating the preference of the passed-in dataframe.
  4413. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
  4414. >>> df2.combine(df1, take_smaller)
  4415. A B C
  4416. 0 0.0 NaN NaN
  4417. 1 0.0 3.0 NaN
  4418. 2 NaN 3.0 NaN
  4419. >>> df2.combine(df1, take_smaller, overwrite=False)
  4420. A B C
  4421. 0 0.0 NaN NaN
  4422. 1 0.0 3.0 1.0
  4423. 2 NaN 3.0 1.0
  4424. """
  4425. other_idxlen = len(other.index) # save for compare
  4426. this, other = self.align(other, copy=False)
  4427. new_index = this.index
  4428. if other.empty and len(new_index) == len(self.index):
  4429. return self.copy()
  4430. if self.empty and len(other) == other_idxlen:
  4431. return other.copy()
  4432. # sorts if possible
  4433. new_columns = this.columns.union(other.columns)
  4434. do_fill = fill_value is not None
  4435. result = {}
  4436. for col in new_columns:
  4437. series = this[col]
  4438. otherSeries = other[col]
  4439. this_dtype = series.dtype
  4440. other_dtype = otherSeries.dtype
  4441. this_mask = isna(series)
  4442. other_mask = isna(otherSeries)
# don't overwrite columns unnecessarily
  4444. # DO propagate if this column is not in the intersection
  4445. if not overwrite and other_mask.all():
  4446. result[col] = this[col].copy()
  4447. continue
  4448. if do_fill:
  4449. series = series.copy()
  4450. otherSeries = otherSeries.copy()
  4451. series[this_mask] = fill_value
  4452. otherSeries[other_mask] = fill_value
  4453. if col not in self.columns:
# If col is in `other` but not in `self`, `series` is all NaN;
# try to cast it to other_dtype.
  4456. new_dtype = other_dtype
  4457. try:
  4458. series = series.astype(new_dtype, copy=False)
  4459. except ValueError:
  4460. # e.g. new_dtype is integer types
  4461. pass
  4462. else:
  4463. # if we have different dtypes, possibly promote
  4464. new_dtype = find_common_type([this_dtype, other_dtype])
  4465. if not is_dtype_equal(this_dtype, new_dtype):
  4466. series = series.astype(new_dtype)
  4467. if not is_dtype_equal(other_dtype, new_dtype):
  4468. otherSeries = otherSeries.astype(new_dtype)
  4469. arr = func(series, otherSeries)
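# try to cast the combined result back to the original column dtype
# (e.g. restore integer dtype after combining two integer columns)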
  4470. arr = maybe_downcast_to_dtype(arr, this_dtype)
  4471. result[col] = arr
  4472. # convert_objects just in case
  4473. return self._constructor(result, index=new_index,
  4474. columns=new_columns)
  4475. def combine_first(self, other):
  4476. """
  4477. Update null elements with value in the same location in `other`.
  4478. Combine two DataFrame objects by filling null values in one DataFrame
  4479. with non-null values from other DataFrame. The row and column indexes
  4480. of the resulting DataFrame will be the union of the two.
  4481. Parameters
  4482. ----------
  4483. other : DataFrame
  4484. Provided DataFrame to use to fill null values.
  4485. Returns
  4486. -------
  4487. DataFrame
  4488. See Also
  4489. --------
  4490. DataFrame.combine : Perform series-wise operation on two DataFrames
  4491. using a given function.
  4492. Examples
  4493. --------
  4494. >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
  4495. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4496. >>> df1.combine_first(df2)
  4497. A B
  4498. 0 1.0 3.0
  4499. 1 0.0 4.0
Null values still persist if the location of that null value
does not exist in `other`:
  4502. >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
  4503. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
  4504. >>> df1.combine_first(df2)
  4505. A B C
  4506. 0 NaN 4.0 NaN
  4507. 1 0.0 3.0 1.0
  4508. 2 NaN 3.0 1.0
  4509. """
  4510. import pandas.core.computation.expressions as expressions
  4511. def extract_values(arr):
  4512. # Does two things:
  4513. # 1. maybe gets the values from the Series / Index
  4514. # 2. convert datelike to i8
  4515. if isinstance(arr, (ABCIndexClass, ABCSeries)):
  4516. arr = arr._values
  4517. if needs_i8_conversion(arr):
  4518. if is_extension_array_dtype(arr.dtype):
  4519. arr = arr.asi8
  4520. else:
  4521. arr = arr.view('i8')
  4522. return arr
  4523. def combiner(x, y):
  4524. mask = isna(x)
  4525. if isinstance(mask, (ABCIndexClass, ABCSeries)):
  4526. mask = mask._values
  4527. x_values = extract_values(x)
  4528. y_values = extract_values(y)
  4529. # If the column y in other DataFrame is not in first DataFrame,
  4530. # just return y_values.
  4531. if y.name not in self.columns:
  4532. return y_values
  4533. return expressions.where(mask, y_values, x_values)
  4534. return self.combine(other, combiner, overwrite=False)
  4535. @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors',
  4536. mapping={False: 'ignore', True: 'raise'})
  4537. def update(self, other, join='left', overwrite=True, filter_func=None,
  4538. errors='ignore'):
  4539. """
  4540. Modify in place using non-NA values from another DataFrame.
  4541. Aligns on indices. There is no return value.
  4542. Parameters
  4543. ----------
  4544. other : DataFrame, or object coercible into a DataFrame
  4545. Should have at least one matching index/column label
  4546. with the original DataFrame. If a Series is passed,
  4547. its name attribute must be set, and that will be
  4548. used as the column name to align with the original DataFrame.
  4549. join : {'left'}, default 'left'
  4550. Only left join is implemented, keeping the index and columns of the
  4551. original object.
  4552. overwrite : bool, default True
  4553. How to handle non-NA values for overlapping keys:
  4554. * True: overwrite original DataFrame's values
  4555. with values from `other`.
  4556. * False: only update values that are NA in
  4557. the original DataFrame.
  4558. filter_func : callable(1d-array) -> bool 1d-array, optional
  4559. Can choose to replace values other than NA. Return True for values
  4560. that should be updated.
  4561. errors : {'raise', 'ignore'}, default 'ignore'
  4562. If 'raise', will raise a ValueError if the DataFrame and `other`
  4563. both contain non-NA data in the same place.
  4564. .. versionchanged :: 0.24.0
  4565. Changed from `raise_conflict=False|True`
  4566. to `errors='ignore'|'raise'`.
  4567. Returns
  4568. -------
  4569. None : method directly changes calling object
  4570. Raises
  4571. ------
  4572. ValueError
  4573. * When `errors='raise'` and there's overlapping non-NA data.
  4574. * When `errors` is not either `'ignore'` or `'raise'`
  4575. NotImplementedError
  4576. * If `join != 'left'`
  4577. See Also
  4578. --------
  4579. dict.update : Similar method for dictionaries.
  4580. DataFrame.merge : For column(s)-on-columns(s) operations.
  4581. Examples
  4582. --------
  4583. >>> df = pd.DataFrame({'A': [1, 2, 3],
  4584. ... 'B': [400, 500, 600]})
  4585. >>> new_df = pd.DataFrame({'B': [4, 5, 6],
  4586. ... 'C': [7, 8, 9]})
  4587. >>> df.update(new_df)
  4588. >>> df
  4589. A B
  4590. 0 1 4
  4591. 1 2 5
  4592. 2 3 6
  4593. The DataFrame's length does not increase as a result of the update,
  4594. only values at matching index/column labels are updated.
  4595. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4596. ... 'B': ['x', 'y', 'z']})
  4597. >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
  4598. >>> df.update(new_df)
  4599. >>> df
  4600. A B
  4601. 0 a d
  4602. 1 b e
  4603. 2 c f
For Series, its name attribute must be set.
  4605. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4606. ... 'B': ['x', 'y', 'z']})
  4607. >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
  4608. >>> df.update(new_column)
  4609. >>> df
  4610. A B
  4611. 0 a d
  4612. 1 b y
  4613. 2 c e
  4614. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4615. ... 'B': ['x', 'y', 'z']})
  4616. >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
  4617. >>> df.update(new_df)
  4618. >>> df
  4619. A B
  4620. 0 a x
  4621. 1 b d
  4622. 2 c e
  4623. If `other` contains NaNs the corresponding values are not updated
  4624. in the original dataframe.
  4625. >>> df = pd.DataFrame({'A': [1, 2, 3],
  4626. ... 'B': [400, 500, 600]})
  4627. >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
  4628. >>> df.update(new_df)
  4629. >>> df
  4630. A B
  4631. 0 1 4.0
  4632. 1 2 500.0
  4633. 2 3 6.0
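`filter_func` restricts which existing values may be replaced; a
minimal sketch with a hypothetical condition:

>>> df = pd.DataFrame({'A': [1, 100, 3]})
>>> new_df = pd.DataFrame({'A': [0, 0, 0]})
>>> df.update(new_df, filter_func=lambda x: x > 10)
>>> df
   A
0  1
1  0
2  3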
  4634. """
  4635. import pandas.core.computation.expressions as expressions
  4636. # TODO: Support other joins
  4637. if join != 'left': # pragma: no cover
  4638. raise NotImplementedError("Only left join is supported")
  4639. if errors not in ['ignore', 'raise']:
  4640. raise ValueError("The parameter errors must be either "
  4641. "'ignore' or 'raise'")
  4642. if not isinstance(other, DataFrame):
  4643. other = DataFrame(other)
  4644. other = other.reindex_like(self)
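# reindex_like implements the left join: `other` is aligned to
# self's labels, and labels missing from `other` become NaN, so
# they never overwrite existing values below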
  4645. for col in self.columns:
  4646. this = self[col].values
  4647. that = other[col].values
  4648. if filter_func is not None:
  4649. with np.errstate(all='ignore'):
  4650. mask = ~filter_func(this) | isna(that)
  4651. else:
  4652. if errors == 'raise':
  4653. mask_this = notna(that)
  4654. mask_that = notna(this)
  4655. if any(mask_this & mask_that):
  4656. raise ValueError("Data overlaps.")
  4657. if overwrite:
  4658. mask = isna(that)
  4659. else:
  4660. mask = notna(this)
# don't overwrite columns unnecessarily
  4662. if mask.all():
  4663. continue
  4664. self[col] = expressions.where(mask, this, that)
  4665. # ----------------------------------------------------------------------
  4666. # Data reshaping
  4667. _shared_docs['pivot'] = """
  4668. Return reshaped DataFrame organized by given index / column values.
  4669. Reshape data (produce a "pivot" table) based on column values. Uses
  4670. unique values from specified `index` / `columns` to form axes of the
resulting DataFrame. This function does not support data
aggregation; multiple values will result in a MultiIndex in the
columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
  4674. Parameters
  4675. ----------%s
  4676. index : string or object, optional
  4677. Column to use to make new frame's index. If None, uses
  4678. existing index.
  4679. columns : string or object
  4680. Column to use to make new frame's columns.
  4681. values : string, object or a list of the previous, optional
  4682. Column(s) to use for populating new frame's values. If not
  4683. specified, all remaining columns will be used and the result will
  4684. have hierarchically indexed columns.
  4685. .. versionchanged :: 0.23.0
  4686. Also accept list of column names.
  4687. Returns
  4688. -------
  4689. DataFrame
  4690. Returns reshaped DataFrame.
  4691. Raises
  4692. ------
ValueError:
When there are any `index`, `columns` combinations with multiple
values. Use `DataFrame.pivot_table` when you need to aggregate.
  4696. See Also
  4697. --------
  4698. DataFrame.pivot_table : Generalization of pivot that can handle
  4699. duplicate values for one index/column pair.
  4700. DataFrame.unstack : Pivot based on the index values instead of a
  4701. column.
  4702. Notes
  4703. -----
  4704. For finer-tuned control, see hierarchical indexing documentation along
  4705. with the related stack/unstack methods.
  4706. Examples
  4707. --------
  4708. >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
  4709. ... 'two'],
  4710. ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  4711. ... 'baz': [1, 2, 3, 4, 5, 6],
  4712. ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
  4713. >>> df
  4714. foo bar baz zoo
  4715. 0 one A 1 x
  4716. 1 one B 2 y
  4717. 2 one C 3 z
  4718. 3 two A 4 q
  4719. 4 two B 5 w
  4720. 5 two C 6 t
  4721. >>> df.pivot(index='foo', columns='bar', values='baz')
  4722. bar A B C
  4723. foo
  4724. one 1 2 3
  4725. two 4 5 6
  4726. >>> df.pivot(index='foo', columns='bar')['baz']
  4727. bar A B C
  4728. foo
  4729. one 1 2 3
  4730. two 4 5 6
  4731. >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
  4732. baz zoo
  4733. bar A B C A B C
  4734. foo
  4735. one 1 2 3 x y z
  4736. two 4 5 6 q w t
  4737. A ValueError is raised if there are any duplicates.
  4738. >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
  4739. ... "bar": ['A', 'A', 'B', 'C'],
  4740. ... "baz": [1, 2, 3, 4]})
  4741. >>> df
  4742. foo bar baz
  4743. 0 one A 1
  4744. 1 one A 2
  4745. 2 two B 3
  4746. 3 two C 4
  4747. Notice that the first two rows are the same for our `index`
  4748. and `columns` arguments.
  4749. >>> df.pivot(index='foo', columns='bar', values='baz')
  4750. Traceback (most recent call last):
  4751. ...
  4752. ValueError: Index contains duplicate entries, cannot reshape
  4753. """
  4754. @Substitution('')
  4755. @Appender(_shared_docs['pivot'])
  4756. def pivot(self, index=None, columns=None, values=None):
  4757. from pandas.core.reshape.pivot import pivot
  4758. return pivot(self, index=index, columns=columns, values=values)
  4759. _shared_docs['pivot_table'] = """
  4760. Create a spreadsheet-style pivot table as a DataFrame. The levels in
  4761. the pivot table will be stored in MultiIndex objects (hierarchical
  4762. indexes) on the index and columns of the result DataFrame.
  4763. Parameters
  4764. ----------%s
  4765. values : column to aggregate, optional
  4766. index : column, Grouper, array, or list of the previous
  4767. If an array is passed, it must be the same length as the data. The
  4768. list can contain any of the other types (except list).
  4769. Keys to group by on the pivot table index. If an array is passed,
it is used in the same manner as column values.
  4771. columns : column, Grouper, array, or list of the previous
  4772. If an array is passed, it must be the same length as the data. The
  4773. list can contain any of the other types (except list).
  4774. Keys to group by on the pivot table column. If an array is passed,
it is used in the same manner as column values.
  4776. aggfunc : function, list of functions, dict, default numpy.mean
  4777. If list of functions passed, the resulting pivot table will have
hierarchical columns whose top level contains the function names
(inferred from the function objects themselves).
If a dict is passed, the key is the column to aggregate and the value
is the function or list of functions.
  4782. fill_value : scalar, default None
  4783. Value to replace missing values with
  4784. margins : boolean, default False
  4785. Add all row / columns (e.g. for subtotal / grand totals)
  4786. dropna : boolean, default True
  4787. Do not include columns whose entries are all NaN
  4788. margins_name : string, default 'All'
  4789. Name of the row / column that will contain the totals
  4790. when margins is True.
  4791. Returns
  4792. -------
  4793. DataFrame
  4794. See Also
  4795. --------
  4796. DataFrame.pivot : Pivot without aggregation that can handle
  4797. non-numeric data.
  4798. Examples
  4799. --------
  4800. >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
  4801. ... "bar", "bar", "bar", "bar"],
  4802. ... "B": ["one", "one", "one", "two", "two",
  4803. ... "one", "one", "two", "two"],
  4804. ... "C": ["small", "large", "large", "small",
  4805. ... "small", "large", "small", "small",
  4806. ... "large"],
  4807. ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  4808. ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
  4809. >>> df
  4810. A B C D E
  4811. 0 foo one small 1 2
  4812. 1 foo one large 2 4
  4813. 2 foo one large 2 5
  4814. 3 foo two small 3 5
  4815. 4 foo two small 3 6
  4816. 5 bar one large 4 6
  4817. 6 bar one small 5 8
  4818. 7 bar two small 6 9
  4819. 8 bar two large 7 9
  4820. This first example aggregates values by taking the sum.
  4821. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
  4822. ... columns=['C'], aggfunc=np.sum)
  4823. >>> table
  4824. C large small
  4825. A B
  4826. bar one 4.0 5.0
  4827. two 7.0 6.0
  4828. foo one 4.0 1.0
  4829. two NaN 6.0
  4830. We can also fill missing values using the `fill_value` parameter.
  4831. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
  4832. ... columns=['C'], aggfunc=np.sum, fill_value=0)
  4833. >>> table
  4834. C large small
  4835. A B
  4836. bar one 4 5
  4837. two 7 6
  4838. foo one 4 1
  4839. two 0 6
  4840. The next example aggregates by taking the mean across multiple columns.
  4841. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  4842. ... aggfunc={'D': np.mean,
  4843. ... 'E': np.mean})
  4844. >>> table
  4845. D E
  4846. A C
  4847. bar large 5.500000 7.500000
  4848. small 5.500000 8.500000
  4849. foo large 2.000000 4.500000
  4850. small 2.333333 4.333333
  4851. We can also calculate multiple types of aggregations for any given
  4852. value column.
  4853. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  4854. ... aggfunc={'D': np.mean,
  4855. ... 'E': [min, max, np.mean]})
  4856. >>> table
  4857. D E
  4858. mean max mean min
  4859. A C
  4860. bar large 5.500000 9.0 7.500000 6.0
  4861. small 5.500000 9.0 8.500000 8.0
  4862. foo large 2.000000 5.0 4.500000 4.0
  4863. small 2.333333 6.0 4.333333 2.0
  4864. """
  4865. @Substitution('')
  4866. @Appender(_shared_docs['pivot_table'])
  4867. def pivot_table(self, values=None, index=None, columns=None,
  4868. aggfunc='mean', fill_value=None, margins=False,
  4869. dropna=True, margins_name='All'):
  4870. from pandas.core.reshape.pivot import pivot_table
  4871. return pivot_table(self, values=values, index=index, columns=columns,
  4872. aggfunc=aggfunc, fill_value=fill_value,
  4873. margins=margins, dropna=dropna,
  4874. margins_name=margins_name)
  4875. def stack(self, level=-1, dropna=True):
  4876. """
  4877. Stack the prescribed level(s) from columns to index.
  4878. Return a reshaped DataFrame or Series having a multi-level
  4879. index with one or more new inner-most levels compared to the current
  4880. DataFrame. The new inner-most levels are created by pivoting the
  4881. columns of the current dataframe:
  4882. - if the columns have a single level, the output is a Series;
  4883. - if the columns have multiple levels, the new index
  4884. level(s) is (are) taken from the prescribed level(s) and
  4885. the output is a DataFrame.
  4886. The new index levels are sorted.
  4887. Parameters
  4888. ----------
  4889. level : int, str, list, default -1
  4890. Level(s) to stack from the column axis onto the index
  4891. axis, defined as one index or label, or a list of indices
  4892. or labels.
  4893. dropna : bool, default True
  4894. Whether to drop rows in the resulting Frame/Series with
  4895. missing values. Stacking a column level onto the index
  4896. axis can create combinations of index and column values
  4897. that are missing from the original dataframe. See Examples
  4898. section.
  4899. Returns
  4900. -------
  4901. DataFrame or Series
  4902. Stacked dataframe or series.
  4903. See Also
  4904. --------
  4905. DataFrame.unstack : Unstack prescribed level(s) from index axis
  4906. onto column axis.
  4907. DataFrame.pivot : Reshape dataframe from long format to wide
  4908. format.
  4909. DataFrame.pivot_table : Create a spreadsheet-style pivot table
  4910. as a DataFrame.
  4911. Notes
  4912. -----
  4913. The function is named by analogy with a collection of books
  4914. being reorganized from being side by side on a horizontal
  4915. position (the columns of the dataframe) to being stacked
  4916. vertically on top of each other (in the index of the
  4917. dataframe).
  4918. Examples
  4919. --------
  4920. **Single level columns**
  4921. >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
  4922. ... index=['cat', 'dog'],
  4923. ... columns=['weight', 'height'])
  4924. Stacking a dataframe with a single level column axis returns a Series:
  4925. >>> df_single_level_cols
  4926. weight height
  4927. cat 0 1
  4928. dog 2 3
  4929. >>> df_single_level_cols.stack()
  4930. cat weight 0
  4931. height 1
  4932. dog weight 2
  4933. height 3
  4934. dtype: int64
  4935. **Multi level columns: simple case**
  4936. >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
  4937. ... ('weight', 'pounds')])
  4938. >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
  4939. ... index=['cat', 'dog'],
  4940. ... columns=multicol1)
  4941. Stacking a dataframe with a multi-level column axis:
  4942. >>> df_multi_level_cols1
  4943. weight
  4944. kg pounds
  4945. cat 1 2
  4946. dog 2 4
  4947. >>> df_multi_level_cols1.stack()
  4948. weight
  4949. cat kg 1
  4950. pounds 2
  4951. dog kg 2
  4952. pounds 4
  4953. **Missing values**
  4954. >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
  4955. ... ('height', 'm')])
  4956. >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
  4957. ... index=['cat', 'dog'],
  4958. ... columns=multicol2)
  4959. It is common to have missing values when stacking a dataframe
  4960. with multi-level columns, as the stacked dataframe typically
  4961. has more values than the original dataframe. Missing values
  4962. are filled with NaNs:
  4963. >>> df_multi_level_cols2
  4964. weight height
  4965. kg m
  4966. cat 1.0 2.0
  4967. dog 3.0 4.0
  4968. >>> df_multi_level_cols2.stack()
  4969. height weight
  4970. cat kg NaN 1.0
  4971. m 2.0 NaN
  4972. dog kg NaN 3.0
  4973. m 4.0 NaN
  4974. **Prescribing the level(s) to be stacked**
  4975. The first parameter controls which level or levels are stacked:
  4976. >>> df_multi_level_cols2.stack(0)
  4977. kg m
  4978. cat height NaN 2.0
  4979. weight 1.0 NaN
  4980. dog height NaN 4.0
  4981. weight 3.0 NaN
  4982. >>> df_multi_level_cols2.stack([0, 1])
  4983. cat height m 2.0
  4984. weight kg 1.0
  4985. dog height m 4.0
  4986. weight kg 3.0
  4987. dtype: float64
  4988. **Dropping missing values**
  4989. >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
  4990. ... index=['cat', 'dog'],
  4991. ... columns=multicol2)
  4992. Note that rows where all values are missing are dropped by
  4993. default but this behaviour can be controlled via the dropna
  4994. keyword parameter:
  4995. >>> df_multi_level_cols3
  4996. weight height
  4997. kg m
  4998. cat NaN 1.0
  4999. dog 2.0 3.0
  5000. >>> df_multi_level_cols3.stack(dropna=False)
  5001. height weight
  5002. cat kg NaN NaN
  5003. m 1.0 NaN
  5004. dog kg NaN 2.0
  5005. m 3.0 NaN
  5006. >>> df_multi_level_cols3.stack(dropna=True)
  5007. height weight
  5008. cat m 1.0 NaN
  5009. dog kg NaN 2.0
  5010. m 3.0 NaN
  5011. """
  5012. from pandas.core.reshape.reshape import stack, stack_multiple
  5013. if isinstance(level, (tuple, list)):
  5014. return stack_multiple(self, level, dropna=dropna)
  5015. else:
  5016. return stack(self, level, dropna=dropna)
  5017. def unstack(self, level=-1, fill_value=None):
  5018. """
  5019. Pivot a level of the (necessarily hierarchical) index labels, returning
  5020. a DataFrame having a new level of column labels whose inner-most level
  5021. consists of the pivoted index labels.
  5022. If the index is not a MultiIndex, the output will be a Series
  5023. (the analogue of stack when the columns are not a MultiIndex).
  5024. The level involved will automatically get sorted.
  5025. Parameters
  5026. ----------
  5027. level : int, string, or list of these, default -1 (last level)
  5028. Level(s) of index to unstack, can pass level name
  5029. fill_value : replace NaN with this value if the unstack produces
  5030. missing values
  5031. .. versionadded:: 0.18.0
  5032. Returns
  5033. -------
  5034. Series or DataFrame
  5035. See Also
  5036. --------
  5037. DataFrame.pivot : Pivot a table based on column values.
  5038. DataFrame.stack : Pivot a level of the column labels (inverse operation
  5039. from `unstack`).
  5040. Examples
  5041. --------
  5042. >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
  5043. ... ('two', 'a'), ('two', 'b')])
  5044. >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
  5045. >>> s
  5046. one a 1.0
  5047. b 2.0
  5048. two a 3.0
  5049. b 4.0
  5050. dtype: float64
  5051. >>> s.unstack(level=-1)
  5052. a b
  5053. one 1.0 2.0
  5054. two 3.0 4.0
  5055. >>> s.unstack(level=0)
  5056. one two
  5057. a 1.0 3.0
  5058. b 2.0 4.0
  5059. >>> df = s.unstack(level=0)
  5060. >>> df.unstack()
  5061. one a 1.0
  5062. b 2.0
  5063. two a 3.0
  5064. b 4.0
  5065. dtype: float64
  5066. """
  5067. from pandas.core.reshape.reshape import unstack
  5068. return unstack(self, level, fill_value)
  5069. _shared_docs['melt'] = ("""
  5070. Unpivot a DataFrame from wide format to long format, optionally
  5071. leaving identifier variables set.
  5072. This function is useful to massage a DataFrame into a format where one
  5073. or more columns are identifier variables (`id_vars`), while all other
  5074. columns, considered measured variables (`value_vars`), are "unpivoted" to
  5075. the row axis, leaving just two non-identifier columns, 'variable' and
  5076. 'value'.
  5077. %(versionadded)s
  5078. Parameters
  5079. ----------
  5080. frame : DataFrame
  5081. id_vars : tuple, list, or ndarray, optional
  5082. Column(s) to use as identifier variables.
  5083. value_vars : tuple, list, or ndarray, optional
  5084. Column(s) to unpivot. If not specified, uses all columns that
  5085. are not set as `id_vars`.
  5086. var_name : scalar
  5087. Name to use for the 'variable' column. If None it uses
  5088. ``frame.columns.name`` or 'variable'.
  5089. value_name : scalar, default 'value'
  5090. Name to use for the 'value' column.
  5091. col_level : int or string, optional
  5092. If columns are a MultiIndex then use this level to melt.
  5093. See Also
  5094. --------
  5095. %(other)s
  5096. pivot_table
  5097. DataFrame.pivot
  5098. Examples
  5099. --------
  5100. >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
  5101. ... 'B': {0: 1, 1: 3, 2: 5},
  5102. ... 'C': {0: 2, 1: 4, 2: 6}})
  5103. >>> df
  5104. A B C
  5105. 0 a 1 2
  5106. 1 b 3 4
  5107. 2 c 5 6
  5108. >>> %(caller)sid_vars=['A'], value_vars=['B'])
  5109. A variable value
  5110. 0 a B 1
  5111. 1 b B 3
  5112. 2 c B 5
  5113. >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
  5114. A variable value
  5115. 0 a B 1
  5116. 1 b B 3
  5117. 2 c B 5
  5118. 3 a C 2
  5119. 4 b C 4
  5120. 5 c C 6
  5121. The names of 'variable' and 'value' columns can be customized:
  5122. >>> %(caller)sid_vars=['A'], value_vars=['B'],
  5123. ... var_name='myVarname', value_name='myValname')
  5124. A myVarname myValname
  5125. 0 a B 1
  5126. 1 b B 3
  5127. 2 c B 5
  5128. If you have multi-index columns:
  5129. >>> df.columns = [list('ABC'), list('DEF')]
  5130. >>> df
  5131. A B C
  5132. D E F
  5133. 0 a 1 2
  5134. 1 b 3 4
  5135. 2 c 5 6
  5136. >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
  5137. A variable value
  5138. 0 a B 1
  5139. 1 b B 3
  5140. 2 c B 5
  5141. >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
  5142. (A, D) variable_0 variable_1 value
  5143. 0 a B E 1
  5144. 1 b B E 3
  5145. 2 c B E 5
  5146. """)
  5147. @Appender(_shared_docs['melt'] %
  5148. dict(caller='df.melt(',
  5149. versionadded='.. versionadded:: 0.20.0\n',
  5150. other='melt'))
  5151. def melt(self, id_vars=None, value_vars=None, var_name=None,
  5152. value_name='value', col_level=None):
  5153. from pandas.core.reshape.melt import melt
  5154. return melt(self, id_vars=id_vars, value_vars=value_vars,
  5155. var_name=var_name, value_name=value_name,
  5156. col_level=col_level)
  5157. # ----------------------------------------------------------------------
  5158. # Time series-related
  5159. def diff(self, periods=1, axis=0):
  5160. """
  5161. First discrete difference of element.
  5162. Calculates the difference of a DataFrame element compared with another
  5163. element in the DataFrame (default is the element in the same column
  5164. of the previous row).
  5165. Parameters
  5166. ----------
  5167. periods : int, default 1
  5168. Periods to shift for calculating difference, accepts negative
  5169. values.
  5170. axis : {0 or 'index', 1 or 'columns'}, default 0
  5171. Take difference over rows (0) or columns (1).
.. versionadded:: 0.16.1
  5173. Returns
  5174. -------
  5175. DataFrame
  5176. See Also
  5177. --------
  5178. Series.diff: First discrete difference for a Series.
  5179. DataFrame.pct_change: Percent change over given number of periods.
  5180. DataFrame.shift: Shift index by desired number of periods with an
  5181. optional time freq.
  5182. Examples
  5183. --------
  5184. Difference with previous row
  5185. >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
  5186. ... 'b': [1, 1, 2, 3, 5, 8],
  5187. ... 'c': [1, 4, 9, 16, 25, 36]})
  5188. >>> df
  5189. a b c
  5190. 0 1 1 1
  5191. 1 2 1 4
  5192. 2 3 2 9
  5193. 3 4 3 16
  5194. 4 5 5 25
  5195. 5 6 8 36
  5196. >>> df.diff()
  5197. a b c
  5198. 0 NaN NaN NaN
  5199. 1 1.0 0.0 3.0
  5200. 2 1.0 1.0 5.0
  5201. 3 1.0 1.0 7.0
  5202. 4 1.0 2.0 9.0
  5203. 5 1.0 3.0 11.0
  5204. Difference with previous column
  5205. >>> df.diff(axis=1)
  5206. a b c
  5207. 0 NaN 0.0 0.0
  5208. 1 NaN -1.0 3.0
  5209. 2 NaN -1.0 7.0
  5210. 3 NaN -1.0 13.0
  5211. 4 NaN 0.0 20.0
  5212. 5 NaN 2.0 28.0
  5213. Difference with 3rd previous row
  5214. >>> df.diff(periods=3)
  5215. a b c
  5216. 0 NaN NaN NaN
  5217. 1 NaN NaN NaN
  5218. 2 NaN NaN NaN
  5219. 3 3.0 2.0 15.0
  5220. 4 3.0 4.0 21.0
  5221. 5 3.0 6.0 27.0
  5222. Difference with following row
  5223. >>> df.diff(periods=-1)
  5224. a b c
  5225. 0 -1.0 0.0 -3.0
  5226. 1 -1.0 -1.0 -5.0
  5227. 2 -1.0 -1.0 -7.0
  5228. 3 -1.0 -2.0 -9.0
  5229. 4 -1.0 -3.0 -11.0
  5230. 5 NaN NaN NaN
  5231. """
  5232. bm_axis = self._get_block_manager_axis(axis)
  5233. new_data = self._data.diff(n=periods, axis=bm_axis)
  5234. return self._constructor(new_data)
  5235. # ----------------------------------------------------------------------
  5236. # Function application
  5237. def _gotitem(self,
  5238. key, # type: Union[str, List[str]]
  5239. ndim, # type: int
  5240. subset=None # type: Union[Series, DataFrame, None]
  5241. ):
  5242. # type: (...) -> Union[Series, DataFrame]
  5243. """
  5244. Sub-classes to define. Return a sliced object.
  5245. Parameters
  5246. ----------
  5247. key : string / list of selections
  5248. ndim : 1,2
  5249. requested ndim of result
  5250. subset : object, default None
  5251. subset to act on
  5252. """
  5253. if subset is None:
  5254. subset = self
  5255. elif subset.ndim == 1: # is Series
  5256. return subset
  5257. # TODO: _shallow_copy(subset)?
  5258. return subset[key]
  5259. _agg_summary_and_see_also_doc = dedent("""
  5260. The aggregation operations are always performed over an axis, either the
  5261. index (default) or the column axis. This behavior is different from
  5262. `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
  5263. `var`), where the default is to compute the aggregation of the flattened
  5264. array, e.g., ``numpy.mean(arr_2d)`` as opposed to ``numpy.mean(arr_2d,
  5265. axis=0)``.
  5266. `agg` is an alias for `aggregate`. Use the alias.
  5267. See Also
  5268. --------
  5269. DataFrame.apply : Perform any type of operations.
  5270. DataFrame.transform : Perform transformation type operations.
  5271. core.groupby.GroupBy : Perform operations over groups.
  5272. core.resample.Resampler : Perform operations over resampled bins.
  5273. core.window.Rolling : Perform operations over rolling window.
  5274. core.window.Expanding : Perform operations over expanding window.
  5275. core.window.EWM : Perform operation over exponential weighted
  5276. window.
  5277. """)
  5278. _agg_examples_doc = dedent("""
  5279. Examples
  5280. --------
  5281. >>> df = pd.DataFrame([[1, 2, 3],
  5282. ... [4, 5, 6],
  5283. ... [7, 8, 9],
  5284. ... [np.nan, np.nan, np.nan]],
  5285. ... columns=['A', 'B', 'C'])
  5286. Aggregate these functions over the rows.
  5287. >>> df.agg(['sum', 'min'])
  5288. A B C
  5289. sum 12.0 15.0 18.0
  5290. min 1.0 2.0 3.0
  5291. Different aggregations per column.
  5292. >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
  5293. A B
  5294. max NaN 8.0
  5295. min 1.0 2.0
  5296. sum 12.0 NaN
  5297. Aggregate over the columns.
  5298. >>> df.agg("mean", axis="columns")
  5299. 0 2.0
  5300. 1 5.0
  5301. 2 8.0
  5302. 3 NaN
  5303. dtype: float64
  5304. """)
  5305. @Substitution(see_also=_agg_summary_and_see_also_doc,
  5306. examples=_agg_examples_doc,
  5307. versionadded='.. versionadded:: 0.20.0',
  5308. **_shared_doc_kwargs)
  5309. @Appender(_shared_docs['aggregate'])
  5310. def aggregate(self, func, axis=0, *args, **kwargs):
  5311. axis = self._get_axis_number(axis)
  5312. result = None
  5313. try:
  5314. result, how = self._aggregate(func, axis=axis, *args, **kwargs)
  5315. except TypeError:
  5316. pass
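# a None result means `func` was not a recognized aggregation
# spec (e.g. a plain callable); fall through to apply below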
  5317. if result is None:
  5318. return self.apply(func, axis=axis, args=args, **kwargs)
  5319. return result
  5320. def _aggregate(self, arg, axis=0, *args, **kwargs):
  5321. if axis == 1:
  5322. # NDFrame.aggregate returns a tuple, and we need to transpose
  5323. # only result
  5324. result, how = (super(DataFrame, self.T)
  5325. ._aggregate(arg, *args, **kwargs))
  5326. result = result.T if result is not None else result
  5327. return result, how
  5328. return super(DataFrame, self)._aggregate(arg, *args, **kwargs)
  5329. agg = aggregate
  5330. @Appender(_shared_docs['transform'] % _shared_doc_kwargs)
  5331. def transform(self, func, axis=0, *args, **kwargs):
  5332. axis = self._get_axis_number(axis)
  5333. if axis == 1:
  5334. return super(DataFrame, self.T).transform(func, *args, **kwargs).T
  5335. return super(DataFrame, self).transform(func, *args, **kwargs)
  5336. def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
  5337. result_type=None, args=(), **kwds):
  5338. """
  5339. Apply a function along an axis of the DataFrame.
  5340. Objects passed to the function are Series objects whose index is
  5341. either the DataFrame's index (``axis=0``) or the DataFrame's columns
  5342. (``axis=1``). By default (``result_type=None``), the final return type
  5343. is inferred from the return type of the applied function. Otherwise,
  5344. it depends on the `result_type` argument.
  5345. Parameters
  5346. ----------
  5347. func : function
  5348. Function to apply to each column or row.
  5349. axis : {0 or 'index', 1 or 'columns'}, default 0
  5350. Axis along which the function is applied:
  5351. * 0 or 'index': apply function to each column.
  5352. * 1 or 'columns': apply function to each row.
  5353. broadcast : bool, optional
  5354. Only relevant for aggregation functions:
  5355. * ``False`` or ``None`` : returns a Series whose length is the
  5356. length of the index or the number of columns (based on the
  5357. `axis` parameter)
  5358. * ``True`` : results will be broadcast to the original shape
  5359. of the frame, the original index and columns will be retained.
  5360. .. deprecated:: 0.23.0
  5361. This argument will be removed in a future version, replaced
  5362. by result_type='broadcast'.
  5363. raw : bool, default False
  5364. * ``False`` : passes each row or column as a Series to the
  5365. function.
  5366. * ``True`` : the passed function will receive ndarray objects
  5367. instead.
  5368. If you are just applying a NumPy reduction function this will
  5369. achieve much better performance.
  5370. reduce : bool or None, default None
  5371. Try to apply reduction procedures. If the DataFrame is empty,
  5372. `apply` will use `reduce` to determine whether the result
  5373. should be a Series or a DataFrame. If ``reduce=None`` (the
  5374. default), `apply`'s return value will be guessed by calling
  5375. `func` on an empty Series
  5376. (note: while guessing, exceptions raised by `func` will be
  5377. ignored).
  5378. If ``reduce=True`` a Series will always be returned, and if
  5379. ``reduce=False`` a DataFrame will always be returned.
  5380. .. deprecated:: 0.23.0
  5381. This argument will be removed in a future version, replaced
  5382. by ``result_type='reduce'``.
  5383. result_type : {'expand', 'reduce', 'broadcast', None}, default None
  5384. These only act when ``axis=1`` (columns):
  5385. * 'expand' : list-like results will be turned into columns.
  5386. * 'reduce' : returns a Series if possible rather than expanding
  5387. list-like results. This is the opposite of 'expand'.
  5388. * 'broadcast' : results will be broadcast to the original shape
  5389. of the DataFrame, the original index and columns will be
  5390. retained.
  5391. The default behaviour (None) depends on the return value of the
  5392. applied function: list-like results will be returned as a Series
  5393. of those. However if the apply function returns a Series these
  5394. are expanded to columns.
  5395. .. versionadded:: 0.23.0
  5396. args : tuple
  5397. Positional arguments to pass to `func` in addition to the
  5398. array/series.
  5399. **kwds
  5400. Additional keyword arguments to pass as keywords arguments to
  5401. `func`.
  5402. Returns
  5403. -------
  5404. Series or DataFrame
  5405. Result of applying ``func`` along the given axis of the
  5406. DataFrame.
  5407. See Also
  5408. --------
  5409. DataFrame.applymap: For elementwise operations.
  5410. DataFrame.aggregate: Only perform aggregating type operations.
  5411. DataFrame.transform: Only perform transforming type operations.
  5412. Notes
  5413. -----
  5414. In the current implementation apply calls `func` twice on the
  5415. first column/row to decide whether it can take a fast or slow
  5416. code path. This can lead to unexpected behavior if `func` has
  5417. side-effects, as they will take effect twice for the first
  5418. column/row.
  5419. Examples
  5420. --------
  5421. >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
  5422. >>> df
  5423. A B
  5424. 0 4 9
  5425. 1 4 9
  5426. 2 4 9
  5427. Using a numpy universal function (in this case the same as
  5428. ``np.sqrt(df)``):
  5429. >>> df.apply(np.sqrt)
  5430. A B
  5431. 0 2.0 3.0
  5432. 1 2.0 3.0
  5433. 2 2.0 3.0
  5434. Using a reducing function on either axis
  5435. >>> df.apply(np.sum, axis=0)
  5436. A 12
  5437. B 27
  5438. dtype: int64
  5439. >>> df.apply(np.sum, axis=1)
  5440. 0 13
  5441. 1 13
  5442. 2 13
  5443. dtype: int64
Returning a list-like will result in a Series
  5445. >>> df.apply(lambda x: [1, 2], axis=1)
  5446. 0 [1, 2]
  5447. 1 [1, 2]
  5448. 2 [1, 2]
  5449. dtype: object
Passing ``result_type='expand'`` will expand list-like results
to columns of a DataFrame
  5452. >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
  5453. 0 1
  5454. 0 1 2
  5455. 1 1 2
  5456. 2 1 2
  5457. Returning a Series inside the function is similar to passing
  5458. ``result_type='expand'``. The resulting column names
  5459. will be the Series index.
  5460. >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
  5461. foo bar
  5462. 0 1 2
  5463. 1 1 2
  5464. 2 1 2
  5465. Passing ``result_type='broadcast'`` will ensure the same shape
  5466. result, whether list-like or scalar is returned by the function,
  5467. and broadcast it along the axis. The resulting column names will
  5468. be the originals.
  5469. >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
  5470. A B
  5471. 0 1 2
  5472. 1 1 2
  5473. 2 1 2
  5474. """
  5475. from pandas.core.apply import frame_apply
  5476. op = frame_apply(self,
  5477. func=func,
  5478. axis=axis,
  5479. broadcast=broadcast,
  5480. raw=raw,
  5481. reduce=reduce,
  5482. result_type=result_type,
  5483. args=args,
  5484. kwds=kwds)
  5485. return op.get_result()
  5486. def applymap(self, func):
  5487. """
  5488. Apply a function to a Dataframe elementwise.
  5489. This method applies a function that accepts and returns a scalar
  5490. to every element of a DataFrame.
  5491. Parameters
  5492. ----------
  5493. func : callable
  5494. Python function, returns a single value from a single value.
  5495. Returns
  5496. -------
  5497. DataFrame
  5498. Transformed DataFrame.
  5499. See Also
  5500. --------
  5501. DataFrame.apply : Apply a function along input axis of DataFrame.
  5502. Notes
  5503. -----
  5504. In the current implementation applymap calls `func` twice on the
  5505. first column/row to decide whether it can take a fast or slow
  5506. code path. This can lead to unexpected behavior if `func` has
  5507. side-effects, as they will take effect twice for the first
  5508. column/row.
  5509. Examples
  5510. --------
  5511. >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
  5512. >>> df
  5513. 0 1
  5514. 0 1.000 2.120
  5515. 1 3.356 4.567
  5516. >>> df.applymap(lambda x: len(str(x)))
  5517. 0 1
  5518. 0 3 4
  5519. 1 5 5
  5520. Note that a vectorized version of `func` often exists, which will
  5521. be much faster. You could square each number elementwise.
  5522. >>> df.applymap(lambda x: x**2)
  5523. 0 1
  5524. 0 1.000000 4.494400
  5525. 1 11.262736 20.857489
  5526. But it's better to avoid applymap in that case.
  5527. >>> df ** 2
  5528. 0 1
  5529. 0 1.000000 4.494400
  5530. 1 11.262736 20.857489
  5531. """
  5532. # if we have a dtype == 'M8[ns]', provide boxed values
  5533. def infer(x):
  5534. if x.empty:
  5535. return lib.map_infer(x, func)
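# astype(object) boxes datetimelike values (Timestamp/Timedelta)
# so `func` receives scalars rather than raw i8 integers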
  5536. return lib.map_infer(x.astype(object).values, func)
  5537. return self.apply(infer)
  5538. # ----------------------------------------------------------------------
  5539. # Merging / joining methods
  5540. def append(self, other, ignore_index=False,
  5541. verify_integrity=False, sort=None):
  5542. """
  5543. Append rows of `other` to the end of caller, returning a new object.
  5544. Columns in `other` that are not in the caller are added as new columns.
  5545. Parameters
  5546. ----------
  5547. other : DataFrame or Series/dict-like object, or list of these
  5548. The data to append.
  5549. ignore_index : boolean, default False
  5550. If True, do not use the index labels.
  5551. verify_integrity : boolean, default False
  5552. If True, raise ValueError on creating index with duplicates.
  5553. sort : boolean, default None
  5554. Sort columns if the columns of `self` and `other` are not aligned.
  5555. The default sorting is deprecated and will change to not-sorting
  5556. in a future version of pandas. Explicitly pass ``sort=True`` to
  5557. silence the warning and sort. Explicitly pass ``sort=False`` to
  5558. silence the warning and not sort.
  5559. .. versionadded:: 0.23.0
  5560. Returns
  5561. -------
  5562. DataFrame
  5563. See Also
  5564. --------
  5565. concat : General function to concatenate DataFrame, Series
  5566. or Panel objects.
  5567. Notes
  5568. -----
  5569. If a list of dict/series is passed and the keys are all contained in
  5570. the DataFrame's index, the order of the columns in the resulting
  5571. DataFrame will be unchanged.
  5572. Iteratively appending rows to a DataFrame can be more computationally
  5573. intensive than a single concatenate. A better solution is to append
  5574. those rows to a list and then concatenate the list with the original
  5575. DataFrame all at once.
  5576. Examples
  5577. --------
  5578. >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
  5579. >>> df
  5580. A B
  5581. 0 1 2
  5582. 1 3 4
  5583. >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
  5584. >>> df.append(df2)
  5585. A B
  5586. 0 1 2
  5587. 1 3 4
  5588. 0 5 6
  5589. 1 7 8
  5590. With `ignore_index` set to True:
  5591. >>> df.append(df2, ignore_index=True)
  5592. A B
  5593. 0 1 2
  5594. 1 3 4
  5595. 2 5 6
  5596. 3 7 8
The following examples, while not recommended ways of generating
DataFrames, show two approaches to building a DataFrame from multiple
data sources.
  5599. Less efficient:
  5600. >>> df = pd.DataFrame(columns=['A'])
  5601. >>> for i in range(5):
  5602. ... df = df.append({'A': i}, ignore_index=True)
  5603. >>> df
  5604. A
  5605. 0 0
  5606. 1 1
  5607. 2 2
  5608. 3 3
  5609. 4 4
  5610. More efficient:
  5611. >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
  5612. ... ignore_index=True)
  5613. A
  5614. 0 0
  5615. 1 1
  5616. 2 2
  5617. 3 3
  5618. 4 4
  5619. """
  5620. if isinstance(other, (Series, dict)):
  5621. if isinstance(other, dict):
  5622. other = Series(other)
  5623. if other.name is None and not ignore_index:
  5624. raise TypeError('Can only append a Series if ignore_index=True'
  5625. ' or if the Series has a name')
  5626. if other.name is None:
  5627. index = None
  5628. else:
  5629. # other must have the same index name as self, otherwise
  5630. # index name will be reset
  5631. index = Index([other.name], name=self.index.name)
  5632. idx_diff = other.index.difference(self.columns)
  5633. try:
  5634. combined_columns = self.columns.append(idx_diff)
  5635. except TypeError:
  5636. combined_columns = self.columns.astype(object).append(idx_diff)
  5637. other = other.reindex(combined_columns, copy=False)
  5638. other = DataFrame(other.values.reshape((1, len(other))),
  5639. index=index,
  5640. columns=combined_columns)
  5641. other = other._convert(datetime=True, timedelta=True)
  5642. if not self.columns.equals(combined_columns):
  5643. self = self.reindex(columns=combined_columns)
  5644. elif isinstance(other, list) and not isinstance(other[0], DataFrame):
  5645. other = DataFrame(other)
  5646. if (self.columns.get_indexer(other.columns) >= 0).all():
  5647. other = other.loc[:, self.columns]
  5648. from pandas.core.reshape.concat import concat
  5649. if isinstance(other, (list, tuple)):
  5650. to_concat = [self] + other
  5651. else:
  5652. to_concat = [self, other]
  5653. return concat(to_concat, ignore_index=ignore_index,
  5654. verify_integrity=verify_integrity,
  5655. sort=sort)
  5656. def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
  5657. sort=False):
  5658. """
  5659. Join columns of another DataFrame.
  5660. Join columns with `other` DataFrame either on index or on a key
  5661. column. Efficiently join multiple DataFrame objects by index at once by
  5662. passing a list.
  5663. Parameters
  5664. ----------
  5665. other : DataFrame, Series, or list of DataFrame
  5666. Index should be similar to one of the columns in this one. If a
  5667. Series is passed, its name attribute must be set, and that will be
  5668. used as the column name in the resulting joined DataFrame.
  5669. on : str, list of str, or array-like, optional
  5670. Column or index level name(s) in the caller to join on the index
  5671. in `other`, otherwise joins index-on-index. If multiple
  5672. values given, the `other` DataFrame must have a MultiIndex. Can
  5673. pass an array as the join key if it is not already contained in
  5674. the calling DataFrame. Like an Excel VLOOKUP operation.
  5675. how : {'left', 'right', 'outer', 'inner'}, default 'left'
  5676. How to handle the operation of the two objects.
  5677. * left: use calling frame's index (or column if on is specified)
  5678. * right: use `other`'s index.
* outer: form union of calling frame's index (or column if on is
specified) with `other`'s index, and sort it
lexicographically.
* inner: form intersection of calling frame's index (or column if
on is specified) with `other`'s index, preserving the order
of the calling frame's index.
  5685. lsuffix : str, default ''
  5686. Suffix to use from left frame's overlapping columns.
  5687. rsuffix : str, default ''
  5688. Suffix to use from right frame's overlapping columns.
  5689. sort : bool, default False
  5690. Order result DataFrame lexicographically by the join key. If False,
  5691. the order of the join key depends on the join type (how keyword).
  5692. Returns
  5693. -------
  5694. DataFrame
  5695. A dataframe containing columns from both the caller and `other`.
  5696. See Also
  5697. --------
  5698. DataFrame.merge : For column(s)-on-columns(s) operations.
  5699. Notes
  5700. -----
  5701. Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
  5702. passing a list of `DataFrame` objects.
  5703. Support for specifying index levels as the `on` parameter was added
  5704. in version 0.23.0.
  5705. Examples
  5706. --------
  5707. >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
  5708. ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
  5709. >>> df
  5710. key A
  5711. 0 K0 A0
  5712. 1 K1 A1
  5713. 2 K2 A2
  5714. 3 K3 A3
  5715. 4 K4 A4
  5716. 5 K5 A5
  5717. >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
  5718. ... 'B': ['B0', 'B1', 'B2']})
  5719. >>> other
  5720. key B
  5721. 0 K0 B0
  5722. 1 K1 B1
  5723. 2 K2 B2
  5724. Join DataFrames using their indexes.
  5725. >>> df.join(other, lsuffix='_caller', rsuffix='_other')
  5726. key_caller A key_other B
  5727. 0 K0 A0 K0 B0
  5728. 1 K1 A1 K1 B1
  5729. 2 K2 A2 K2 B2
  5730. 3 K3 A3 NaN NaN
  5731. 4 K4 A4 NaN NaN
  5732. 5 K5 A5 NaN NaN
  5733. If we want to join using the key columns, we need to set key to be
  5734. the index in both `df` and `other`. The joined DataFrame will have
  5735. key as its index.
  5736. >>> df.set_index('key').join(other.set_index('key'))
  5737. A B
  5738. key
  5739. K0 A0 B0
  5740. K1 A1 B1
  5741. K2 A2 B2
  5742. K3 A3 NaN
  5743. K4 A4 NaN
  5744. K5 A5 NaN
  5745. Another option to join using the key columns is to use the `on`
  5746. parameter. DataFrame.join always uses `other`'s index but we can use
  5747. any column in `df`. This method preserves the original DataFrame's
  5748. index in the result.
  5749. >>> df.join(other.set_index('key'), on='key')
  5750. key A B
  5751. 0 K0 A0 B0
  5752. 1 K1 A1 B1
  5753. 2 K2 A2 B2
  5754. 3 K3 A3 NaN
  5755. 4 K4 A4 NaN
  5756. 5 K5 A5 NaN
  5757. """
  5758. # For SparseDataFrame's benefit
  5759. return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
  5760. rsuffix=rsuffix, sort=sort)
  5761. def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
  5762. sort=False):
  5763. from pandas.core.reshape.merge import merge
  5764. from pandas.core.reshape.concat import concat
  5765. if isinstance(other, Series):
  5766. if other.name is None:
  5767. raise ValueError('Other Series must have a name')
  5768. other = DataFrame({other.name: other})
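# A named Series is promoted to a one-column DataFrame keyed by its name,
# so the DataFrame/DataFrame merge path below handles both cases uniformly.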
  5769. if isinstance(other, DataFrame):
  5770. return merge(self, other, left_on=on, how=how,
  5771. left_index=on is None, right_index=True,
  5772. suffixes=(lsuffix, rsuffix), sort=sort)
  5773. else:
  5774. if on is not None:
  5775. raise ValueError('Joining multiple DataFrames only supported'
  5776. ' for joining on index')
  5777. frames = [self] + list(other)
  5778. can_concat = all(df.index.is_unique for df in frames)
  5779. # join indexes only using concat
  5780. if can_concat:
  5781. if how == 'left':
  5782. how = 'outer'
  5783. join_axes = [self.index]
  5784. else:
  5785. join_axes = None
  5786. return concat(frames, axis=1, join=how, join_axes=join_axes,
  5787. verify_integrity=True)
  5788. joined = frames[0]
  5789. for frame in frames[1:]:
  5790. joined = merge(joined, frame, how=how, left_index=True,
  5791. right_index=True)
  5792. return joined
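# A minimal sketch of the list path above (names are illustrative): with
# unique indexes, joining reduces to a single concat along axis=1;
# otherwise each frame is merged in turn on the index:
#
#   left = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
#   others = [pd.DataFrame({'b': [3]}, index=['x']),
#             pd.DataFrame({'c': [4]}, index=['y'])]
#   left.join(others)  # -> columns a, b, c aligned on ['x', 'y']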
  5793. @Substitution('')
  5794. @Appender(_merge_doc, indents=2)
  5795. def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
  5796. left_index=False, right_index=False, sort=False,
  5797. suffixes=('_x', '_y'), copy=True, indicator=False,
  5798. validate=None):
  5799. from pandas.core.reshape.merge import merge
  5800. return merge(self, right, how=how, on=on, left_on=left_on,
  5801. right_on=right_on, left_index=left_index,
  5802. right_index=right_index, sort=sort, suffixes=suffixes,
  5803. copy=copy, indicator=indicator, validate=validate)
  5804. def round(self, decimals=0, *args, **kwargs):
  5805. """
  5806. Round a DataFrame to a variable number of decimal places.
  5807. Parameters
  5808. ----------
  5809. decimals : int, dict, Series
  5810. Number of decimal places to round each column to. If an int is
  5811. given, round each column to the same number of places.
  5812. Otherwise dict and Series round to variable numbers of places.
  5813. Column names should be in the keys if `decimals` is a
  5814. dict-like, or in the index if `decimals` is a Series. Any
  5815. columns not included in `decimals` will be left as is. Elements
  5816. of `decimals` which are not columns of the input will be
  5817. ignored.
  5818. *args
  5819. Additional keywords have no effect but might be accepted for
  5820. compatibility with numpy.
  5821. **kwargs
  5822. Additional keywords have no effect but might be accepted for
  5823. compatibility with numpy.
  5824. Returns
  5825. -------
  5826. DataFrame
  5827. A DataFrame with the affected columns rounded to the specified
  5828. number of decimal places.
  5829. See Also
  5830. --------
  5831. numpy.around : Round a numpy array to the given number of decimals.
  5832. Series.round : Round a Series to the given number of decimals.
  5833. Examples
  5834. --------
  5835. >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
  5836. ... columns=['dogs', 'cats'])
  5837. >>> df
  5838. dogs cats
  5839. 0 0.21 0.32
  5840. 1 0.01 0.67
  5841. 2 0.66 0.03
  5842. 3 0.21 0.18
5843. By providing an integer, each column is rounded to the same number
  5844. of decimal places
  5845. >>> df.round(1)
  5846. dogs cats
  5847. 0 0.2 0.3
  5848. 1 0.0 0.7
  5849. 2 0.7 0.0
  5850. 3 0.2 0.2
  5851. With a dict, the number of places for specific columns can be
5852. specified with the column names as key and the number of decimal
  5853. places as value
  5854. >>> df.round({'dogs': 1, 'cats': 0})
  5855. dogs cats
  5856. 0 0.2 0.0
  5857. 1 0.0 1.0
  5858. 2 0.7 0.0
  5859. 3 0.2 0.0
  5860. Using a Series, the number of places for specific columns can be
5861. specified with the column names as index and the number of
  5862. decimal places as value
  5863. >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
  5864. >>> df.round(decimals)
  5865. dogs cats
  5866. 0 0.2 0.0
  5867. 1 0.0 1.0
  5868. 2 0.7 0.0
  5869. 3 0.2 0.0
  5870. """
  5871. from pandas.core.reshape.concat import concat
  5872. def _dict_round(df, decimals):
  5873. for col, vals in df.iteritems():
  5874. try:
  5875. yield _series_round(vals, decimals[col])
  5876. except KeyError:
  5877. yield vals
  5878. def _series_round(s, decimals):
  5879. if is_integer_dtype(s) or is_float_dtype(s):
  5880. return s.round(decimals)
  5881. return s
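# Non-numeric columns pass through unrounded; only integer and float
# dtypes dispatch to Series.round.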
  5882. nv.validate_round(args, kwargs)
  5883. if isinstance(decimals, (dict, Series)):
  5884. if isinstance(decimals, Series):
  5885. if not decimals.index.is_unique:
  5886. raise ValueError("Index of decimals must be unique")
5887. new_cols = list(_dict_round(self, decimals))
  5888. elif is_integer(decimals):
  5889. # Dispatch to Series.round
  5890. new_cols = [_series_round(v, decimals)
  5891. for _, v in self.iteritems()]
  5892. else:
  5893. raise TypeError("decimals must be an integer, a dict-like or a "
  5894. "Series")
  5895. if len(new_cols) > 0:
  5896. return self._constructor(concat(new_cols, axis=1),
  5897. index=self.index,
  5898. columns=self.columns)
  5899. else:
  5900. return self
  5901. # ----------------------------------------------------------------------
  5902. # Statistical methods, etc.
  5903. def corr(self, method='pearson', min_periods=1):
  5904. """
  5905. Compute pairwise correlation of columns, excluding NA/null values.
  5906. Parameters
  5907. ----------
  5908. method : {'pearson', 'kendall', 'spearman'} or callable
  5909. * pearson : standard correlation coefficient
  5910. * kendall : Kendall Tau correlation coefficient
  5911. * spearman : Spearman rank correlation
  5912. * callable: callable with input two 1d ndarrays
  5913. and returning a float
  5914. .. versionadded:: 0.24.0
  5915. min_periods : int, optional
  5916. Minimum number of observations required per pair of columns
  5917. to have a valid result. Currently only available for Pearson
  5918. and Spearman correlation.
  5919. Returns
  5920. -------
  5921. DataFrame
  5922. Correlation matrix.
  5923. See Also
  5924. --------
  5925. DataFrame.corrwith
  5926. Series.corr
  5927. Examples
  5928. --------
  5929. >>> def histogram_intersection(a, b):
  5930. ... v = np.minimum(a, b).sum().round(decimals=1)
  5931. ... return v
  5932. >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
  5933. ... columns=['dogs', 'cats'])
  5934. >>> df.corr(method=histogram_intersection)
  5935. dogs cats
  5936. dogs 1.0 0.3
  5937. cats 0.3 1.0
  5938. """
  5939. numeric_df = self._get_numeric_data()
  5940. cols = numeric_df.columns
  5941. idx = cols.copy()
  5942. mat = numeric_df.values
  5943. if method == 'pearson':
  5944. correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
  5945. elif method == 'spearman':
  5946. correl = libalgos.nancorr_spearman(ensure_float64(mat),
  5947. minp=min_periods)
  5948. elif method == 'kendall' or callable(method):
  5949. if min_periods is None:
  5950. min_periods = 1
  5951. mat = ensure_float64(mat).T
  5952. corrf = nanops.get_corr_func(method)
  5953. K = len(cols)
  5954. correl = np.empty((K, K), dtype=float)
  5955. mask = np.isfinite(mat)
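# Correlation matrices are symmetric, so only the upper triangle
# (i <= j) is computed and then mirrored into the lower triangle.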
  5956. for i, ac in enumerate(mat):
  5957. for j, bc in enumerate(mat):
  5958. if i > j:
  5959. continue
  5960. valid = mask[i] & mask[j]
  5961. if valid.sum() < min_periods:
  5962. c = np.nan
  5963. elif i == j:
  5964. c = 1.
  5965. elif not valid.all():
  5966. c = corrf(ac[valid], bc[valid])
  5967. else:
  5968. c = corrf(ac, bc)
  5969. correl[i, j] = c
  5970. correl[j, i] = c
  5971. else:
  5972. raise ValueError("method must be either 'pearson', "
  5973. "'spearman', or 'kendall', '{method}' "
  5974. "was supplied".format(method=method))
  5975. return self._constructor(correl, index=idx, columns=cols)
  5976. def cov(self, min_periods=None):
  5977. """
  5978. Compute pairwise covariance of columns, excluding NA/null values.
  5979. Compute the pairwise covariance among the series of a DataFrame.
  5980. The returned data frame is the `covariance matrix
  5981. <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
  5982. of the DataFrame.
  5983. Both NA and null values are automatically excluded from the
  5984. calculation. (See the note below about bias from missing values.)
  5985. A threshold can be set for the minimum number of
  5986. observations for each value created. Comparisons with observations
  5987. below this threshold will be returned as ``NaN``.
  5988. This method is generally used for the analysis of time series data to
  5989. understand the relationship between different measures
  5990. across time.
  5991. Parameters
  5992. ----------
  5993. min_periods : int, optional
  5994. Minimum number of observations required per pair of columns
  5995. to have a valid result.
  5996. Returns
  5997. -------
  5998. DataFrame
  5999. The covariance matrix of the series of the DataFrame.
  6000. See Also
  6001. --------
  6002. Series.cov : Compute covariance with another Series.
6003. core.window.EWM.cov : Exponential weighted sample covariance.
  6004. core.window.Expanding.cov : Expanding sample covariance.
  6005. core.window.Rolling.cov : Rolling sample covariance.
  6006. Notes
  6007. -----
  6008. Returns the covariance matrix of the DataFrame's time series.
  6009. The covariance is normalized by N-1.
  6010. For DataFrames that have Series that are missing data (assuming that
  6011. data is `missing at random
  6012. <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
  6013. the returned covariance matrix will be an unbiased estimate
  6014. of the variance and covariance between the member Series.
  6015. However, for many applications this estimate may not be acceptable
6016. because the estimated covariance matrix is not guaranteed to be positive
6017. semi-definite. This could lead to estimated correlations having
  6018. absolute values which are greater than one, and/or a non-invertible
  6019. covariance matrix. See `Estimation of covariance matrices
  6020. <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
  6021. matrices>`__ for more details.
  6022. Examples
  6023. --------
  6024. >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
  6025. ... columns=['dogs', 'cats'])
  6026. >>> df.cov()
  6027. dogs cats
  6028. dogs 0.666667 -1.000000
  6029. cats -1.000000 1.666667
  6030. >>> np.random.seed(42)
  6031. >>> df = pd.DataFrame(np.random.randn(1000, 5),
  6032. ... columns=['a', 'b', 'c', 'd', 'e'])
  6033. >>> df.cov()
  6034. a b c d e
  6035. a 0.998438 -0.020161 0.059277 -0.008943 0.014144
  6036. b -0.020161 1.059352 -0.008543 -0.024738 0.009826
  6037. c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
  6038. d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
  6039. e 0.014144 0.009826 -0.000271 -0.013692 0.977795
  6040. **Minimum number of periods**
  6041. This method also supports an optional ``min_periods`` keyword
  6042. that specifies the required minimum number of non-NA observations for
  6043. each column pair in order to have a valid result:
  6044. >>> np.random.seed(42)
  6045. >>> df = pd.DataFrame(np.random.randn(20, 3),
  6046. ... columns=['a', 'b', 'c'])
  6047. >>> df.loc[df.index[:5], 'a'] = np.nan
  6048. >>> df.loc[df.index[5:10], 'b'] = np.nan
  6049. >>> df.cov(min_periods=12)
  6050. a b c
  6051. a 0.316741 NaN -0.150812
  6052. b NaN 1.248003 0.191417
  6053. c -0.150812 0.191417 0.895202
  6054. """
  6055. numeric_df = self._get_numeric_data()
  6056. cols = numeric_df.columns
  6057. idx = cols.copy()
  6058. mat = numeric_df.values
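# Fast path: with no missing values, np.cov can be used directly;
# otherwise fall back to the pairwise NaN-aware Cython routine.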
  6059. if notna(mat).all():
  6060. if min_periods is not None and min_periods > len(mat):
  6061. baseCov = np.empty((mat.shape[1], mat.shape[1]))
  6062. baseCov.fill(np.nan)
  6063. else:
  6064. baseCov = np.cov(mat.T)
  6065. baseCov = baseCov.reshape((len(cols), len(cols)))
  6066. else:
  6067. baseCov = libalgos.nancorr(ensure_float64(mat), cov=True,
  6068. minp=min_periods)
  6069. return self._constructor(baseCov, index=idx, columns=cols)
  6070. def corrwith(self, other, axis=0, drop=False, method='pearson'):
  6071. """
  6072. Compute pairwise correlation between rows or columns of DataFrame
  6073. with rows or columns of Series or DataFrame. DataFrames are first
  6074. aligned along both axes before computing the correlations.
  6075. Parameters
  6076. ----------
  6077. other : DataFrame, Series
  6078. Object with which to compute correlations.
  6079. axis : {0 or 'index', 1 or 'columns'}, default 0
  6080. 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise.
  6081. drop : bool, default False
  6082. Drop missing indices from result.
  6083. method : {'pearson', 'kendall', 'spearman'} or callable
  6084. * pearson : standard correlation coefficient
  6085. * kendall : Kendall Tau correlation coefficient
  6086. * spearman : Spearman rank correlation
  6087. * callable: callable with input two 1d ndarrays
  6088. and returning a float
  6089. .. versionadded:: 0.24.0
  6090. Returns
  6091. -------
  6092. Series
  6093. Pairwise correlations.
  6094. See Also
6095. --------
  6096. DataFrame.corr
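Examples
--------
A minimal illustration (hypothetical data): correlating a DataFrame
with itself yields 1.0 for every column.

>>> df = pd.DataFrame({'a': [1., 2., 3.], 'b': [3., 2., 1.]})
>>> df.corrwith(df)
a    1.0
b    1.0
dtype: float64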
  6097. """
  6098. axis = self._get_axis_number(axis)
  6099. this = self._get_numeric_data()
  6100. if isinstance(other, Series):
  6101. return this.apply(lambda x: other.corr(x, method=method),
  6102. axis=axis)
  6103. other = other._get_numeric_data()
  6104. left, right = this.align(other, join='inner', copy=False)
  6105. if axis == 1:
  6106. left = left.T
  6107. right = right.T
  6108. if method == 'pearson':
  6109. # mask missing values
  6110. left = left + right * 0
  6111. right = right + left * 0
  6112. # demeaned data
  6113. ldem = left - left.mean()
  6114. rdem = right - right.mean()
  6115. num = (ldem * rdem).sum()
  6116. dom = (left.count() - 1) * left.std() * right.std()
  6117. correl = num / dom
  6118. elif method in ['kendall', 'spearman'] or callable(method):
  6119. def c(x):
  6120. return nanops.nancorr(x[0], x[1], method=method)
  6121. correl = Series(map(c,
  6122. zip(left.values.T, right.values.T)),
  6123. index=left.columns)
  6124. else:
  6125. raise ValueError("Invalid method {method} was passed, "
  6126. "valid methods are: 'pearson', 'kendall', "
  6127. "'spearman', or callable".
  6128. format(method=method))
  6129. if not drop:
  6130. # Find non-matching labels along the given axis
  6131. # and append missing correlations (GH 22375)
  6132. raxis = 1 if axis == 0 else 0
  6133. result_index = (this._get_axis(raxis).
  6134. union(other._get_axis(raxis)))
  6135. idx_diff = result_index.difference(correl.index)
  6136. if len(idx_diff) > 0:
  6137. correl = correl.append(Series([np.nan] * len(idx_diff),
  6138. index=idx_diff))
  6139. return correl
  6140. # ----------------------------------------------------------------------
  6141. # ndarray-like stats methods
  6142. def count(self, axis=0, level=None, numeric_only=False):
  6143. """
  6144. Count non-NA cells for each column or row.
  6145. The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
  6146. on `pandas.options.mode.use_inf_as_na`) are considered NA.
  6147. Parameters
  6148. ----------
  6149. axis : {0 or 'index', 1 or 'columns'}, default 0
  6150. If 0 or 'index' counts are generated for each column.
  6151. If 1 or 'columns' counts are generated for each **row**.
  6152. level : int or str, optional
  6153. If the axis is a `MultiIndex` (hierarchical), count along a
  6154. particular `level`, collapsing into a `DataFrame`.
  6155. A `str` specifies the level name.
  6156. numeric_only : bool, default False
  6157. Include only `float`, `int` or `boolean` data.
  6158. Returns
  6159. -------
  6160. Series or DataFrame
  6161. For each column/row the number of non-NA/null entries.
  6162. If `level` is specified returns a `DataFrame`.
  6163. See Also
  6164. --------
  6165. Series.count: Number of non-NA elements in a Series.
  6166. DataFrame.shape: Number of DataFrame rows and columns (including NA
  6167. elements).
  6168. DataFrame.isna: Boolean same-sized DataFrame showing places of NA
  6169. elements.
  6170. Examples
  6171. --------
  6172. Constructing DataFrame from a dictionary:
  6173. >>> df = pd.DataFrame({"Person":
  6174. ... ["John", "Myla", "Lewis", "John", "Myla"],
  6175. ... "Age": [24., np.nan, 21., 33, 26],
  6176. ... "Single": [False, True, True, True, False]})
  6177. >>> df
  6178. Person Age Single
  6179. 0 John 24.0 False
  6180. 1 Myla NaN True
  6181. 2 Lewis 21.0 True
  6182. 3 John 33.0 True
  6183. 4 Myla 26.0 False
  6184. Notice the uncounted NA values:
  6185. >>> df.count()
  6186. Person 5
  6187. Age 4
  6188. Single 5
  6189. dtype: int64
  6190. Counts for each **row**:
  6191. >>> df.count(axis='columns')
  6192. 0 3
  6193. 1 2
  6194. 2 3
  6195. 3 3
  6196. 4 3
  6197. dtype: int64
  6198. Counts for one level of a `MultiIndex`:
  6199. >>> df.set_index(["Person", "Single"]).count(level="Person")
  6200. Age
  6201. Person
  6202. John 2
  6203. Lewis 1
  6204. Myla 1
  6205. """
  6206. axis = self._get_axis_number(axis)
  6207. if level is not None:
  6208. return self._count_level(level, axis=axis,
  6209. numeric_only=numeric_only)
  6210. if numeric_only:
  6211. frame = self._get_numeric_data()
  6212. else:
  6213. frame = self
  6214. # GH #423
  6215. if len(frame._get_axis(axis)) == 0:
  6216. result = Series(0, index=frame._get_agg_axis(axis))
  6217. else:
  6218. if frame._is_mixed_type or frame._data.any_extension_types:
  6219. # the or any_extension_types is really only hit for single-
  6220. # column frames with an extension array
  6221. result = notna(frame).sum(axis=axis)
  6222. else:
  6223. # GH13407
  6224. series_counts = notna(frame).sum(axis=axis)
  6225. counts = series_counts.values
  6226. result = Series(counts, index=frame._get_agg_axis(axis))
  6227. return result.astype('int64')
  6228. def _count_level(self, level, axis=0, numeric_only=False):
  6229. if numeric_only:
  6230. frame = self._get_numeric_data()
  6231. else:
  6232. frame = self
  6233. count_axis = frame._get_axis(axis)
  6234. agg_axis = frame._get_agg_axis(axis)
  6235. if not isinstance(count_axis, MultiIndex):
  6236. raise TypeError("Can only count levels on hierarchical "
  6237. "{ax}.".format(ax=self._get_axis_name(axis)))
  6238. if frame._is_mixed_type:
  6239. # Since we have mixed types, calling notna(frame.values) might
  6240. # upcast everything to object
  6241. mask = notna(frame).values
  6242. else:
  6243. # But use the speedup when we have homogeneous dtypes
  6244. mask = notna(frame.values)
  6245. if axis == 1:
  6246. # We're transposing the mask rather than frame to avoid potential
  6247. # upcasts to object, which induces a ~20x slowdown
  6248. mask = mask.T
  6249. if isinstance(level, compat.string_types):
  6250. level = count_axis._get_level_number(level)
  6251. level_index = count_axis.levels[level]
  6252. level_codes = ensure_int64(count_axis.codes[level])
  6253. counts = lib.count_level_2d(mask, level_codes, len(level_index),
  6254. axis=0)
  6255. result = DataFrame(counts, index=level_index, columns=agg_axis)
  6256. if axis == 1:
  6257. # Undo our earlier transpose
  6258. return result.T
  6259. else:
  6260. return result
  6261. def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
  6262. filter_type=None, **kwds):
  6263. if axis is None and filter_type == 'bool':
  6264. labels = None
  6265. constructor = None
  6266. else:
  6267. # TODO: Make other agg func handle axis=None properly
  6268. axis = self._get_axis_number(axis)
  6269. labels = self._get_agg_axis(axis)
  6270. constructor = self._constructor
  6271. def f(x):
  6272. return op(x, axis=axis, skipna=skipna, **kwds)
  6273. # exclude timedelta/datetime unless we are uniform types
  6274. if (axis == 1 and self._is_datelike_mixed_type
  6275. and (not self._is_homogeneous_type
  6276. and not is_datetime64tz_dtype(self.dtypes[0]))):
  6277. numeric_only = True
  6278. if numeric_only is None:
  6279. try:
  6280. values = self.values
  6281. result = f(values)
  6282. if (filter_type == 'bool' and is_object_dtype(values) and
  6283. axis is None):
  6284. # work around https://github.com/numpy/numpy/issues/10489
  6285. # TODO: combine with hasattr(result, 'dtype') further down
  6286. # hard since we don't have `values` down there.
  6287. result = np.bool_(result)
  6288. except Exception as e:
  6289. # try by-column first
  6290. if filter_type is None and axis == 0:
  6291. try:
  6292. # this can end up with a non-reduction
  6293. # but not always. if the types are mixed
  6294. # with datelike then need to make sure a series
  6295. # we only end up here if we have not specified
  6296. # numeric_only and yet we have tried a
  6297. # column-by-column reduction, where we have mixed type.
  6298. # So let's just do what we can
  6299. from pandas.core.apply import frame_apply
  6300. opa = frame_apply(self,
  6301. func=f,
  6302. result_type='expand',
  6303. ignore_failures=True)
  6304. result = opa.get_result()
  6305. if result.ndim == self.ndim:
  6306. result = result.iloc[0]
  6307. return result
  6308. except Exception:
  6309. pass
  6310. if filter_type is None or filter_type == 'numeric':
  6311. data = self._get_numeric_data()
  6312. elif filter_type == 'bool':
  6313. data = self._get_bool_data()
  6314. else: # pragma: no cover
  6315. e = NotImplementedError(
  6316. "Handling exception with filter_type {f} not"
  6317. "implemented.".format(f=filter_type))
  6318. raise_with_traceback(e)
  6319. with np.errstate(all='ignore'):
  6320. result = f(data.values)
  6321. labels = data._get_agg_axis(axis)
  6322. else:
  6323. if numeric_only:
  6324. if filter_type is None or filter_type == 'numeric':
  6325. data = self._get_numeric_data()
  6326. elif filter_type == 'bool':
  6327. # GH 25101, # GH 24434
  6328. data = self._get_bool_data() if axis == 0 else self
  6329. else: # pragma: no cover
  6330. msg = ("Generating numeric_only data with filter_type {f}"
  6331. "not supported.".format(f=filter_type))
  6332. raise NotImplementedError(msg)
  6333. values = data.values
  6334. labels = data._get_agg_axis(axis)
  6335. else:
  6336. values = self.values
  6337. result = f(values)
  6338. if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
  6339. try:
  6340. if filter_type is None or filter_type == 'numeric':
  6341. result = result.astype(np.float64)
  6342. elif filter_type == 'bool' and notna(result).all():
  6343. result = result.astype(np.bool_)
  6344. except (ValueError, TypeError):
  6345. # try to coerce to the original dtypes item by item if we can
  6346. if axis == 0:
  6347. result = coerce_to_dtypes(result, self.dtypes)
  6348. if constructor is not None:
  6349. result = Series(result, index=labels)
  6350. return result
  6351. def nunique(self, axis=0, dropna=True):
  6352. """
  6353. Count distinct observations over requested axis.
  6354. Return Series with number of distinct observations. Can ignore NaN
  6355. values.
  6356. .. versionadded:: 0.20.0
  6357. Parameters
  6358. ----------
  6359. axis : {0 or 'index', 1 or 'columns'}, default 0
  6360. The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
  6361. column-wise.
  6362. dropna : bool, default True
  6363. Don't include NaN in the counts.
  6364. Returns
  6365. -------
  6366. Series
  6367. See Also
  6368. --------
  6369. Series.nunique: Method nunique for Series.
  6370. DataFrame.count: Count non-NA cells for each column or row.
  6371. Examples
  6372. --------
  6373. >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
  6374. >>> df.nunique()
  6375. A 3
  6376. B 1
  6377. dtype: int64
  6378. >>> df.nunique(axis=1)
  6379. 0 1
  6380. 1 2
  6381. 2 2
  6382. dtype: int64
  6383. """
  6384. return self.apply(Series.nunique, axis=axis, dropna=dropna)
  6385. def idxmin(self, axis=0, skipna=True):
  6386. """
  6387. Return index of first occurrence of minimum over requested axis.
  6388. NA/null values are excluded.
  6389. Parameters
  6390. ----------
  6391. axis : {0 or 'index', 1 or 'columns'}, default 0
  6392. 0 or 'index' for row-wise, 1 or 'columns' for column-wise
  6393. skipna : boolean, default True
  6394. Exclude NA/null values. If an entire row/column is NA, the result
  6395. will be NA.
  6396. Returns
  6397. -------
  6398. Series
  6399. Indexes of minima along the specified axis.
  6400. Raises
  6401. ------
  6402. ValueError
  6403. * If the row/column is empty
  6404. See Also
  6405. --------
  6406. Series.idxmin
  6407. Notes
  6408. -----
6409. This method is the DataFrame version of ``ndarray.argmin``, except that it returns index labels rather than integer positions.
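Examples
--------
A minimal illustration (hypothetical data): the row label of each
column's minimum is returned.

>>> df = pd.DataFrame({'a': [3.0, 1.0], 'b': [2.0, 4.0]})
>>> df.idxmin()
a    1
b    0
dtype: int64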
  6410. """
  6411. axis = self._get_axis_number(axis)
  6412. indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
  6413. index = self._get_axis(axis)
  6414. result = [index[i] if i >= 0 else np.nan for i in indices]
  6415. return Series(result, index=self._get_agg_axis(axis))
  6416. def idxmax(self, axis=0, skipna=True):
  6417. """
  6418. Return index of first occurrence of maximum over requested axis.
  6419. NA/null values are excluded.
  6420. Parameters
  6421. ----------
  6422. axis : {0 or 'index', 1 or 'columns'}, default 0
  6423. 0 or 'index' for row-wise, 1 or 'columns' for column-wise
  6424. skipna : boolean, default True
  6425. Exclude NA/null values. If an entire row/column is NA, the result
  6426. will be NA.
  6427. Returns
  6428. -------
  6429. Series
  6430. Indexes of maxima along the specified axis.
  6431. Raises
  6432. ------
  6433. ValueError
  6434. * If the row/column is empty
  6435. See Also
  6436. --------
  6437. Series.idxmax
  6438. Notes
  6439. -----
6440. This method is the DataFrame version of ``ndarray.argmax``, except that it returns index labels rather than integer positions.
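Examples
--------
A minimal illustration (hypothetical data): the row label of each
column's maximum is returned.

>>> df = pd.DataFrame({'a': [3.0, 1.0], 'b': [2.0, 4.0]})
>>> df.idxmax()
a    0
b    1
dtype: int64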
  6441. """
  6442. axis = self._get_axis_number(axis)
  6443. indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
  6444. index = self._get_axis(axis)
  6445. result = [index[i] if i >= 0 else np.nan for i in indices]
  6446. return Series(result, index=self._get_agg_axis(axis))
  6447. def _get_agg_axis(self, axis_num):
  6448. """
  6449. Let's be explicit about this.
  6450. """
  6451. if axis_num == 0:
  6452. return self.columns
  6453. elif axis_num == 1:
  6454. return self.index
  6455. else:
6456. raise ValueError('Axis must be 0 or 1 (got {ax!r})'.format(ax=axis_num))
  6457. def mode(self, axis=0, numeric_only=False, dropna=True):
  6458. """
  6459. Get the mode(s) of each element along the selected axis.
  6460. The mode of a set of values is the value that appears most often.
  6461. It can be multiple values.
  6462. Parameters
  6463. ----------
  6464. axis : {0 or 'index', 1 or 'columns'}, default 0
  6465. The axis to iterate over while searching for the mode:
  6466. * 0 or 'index' : get mode of each column
  6467. * 1 or 'columns' : get mode of each row
  6468. numeric_only : bool, default False
  6469. If True, only apply to numeric columns.
  6470. dropna : bool, default True
  6471. Don't consider counts of NaN/NaT.
  6472. .. versionadded:: 0.24.0
  6473. Returns
  6474. -------
  6475. DataFrame
  6476. The modes of each column or row.
  6477. See Also
  6478. --------
  6479. Series.mode : Return the highest frequency value in a Series.
  6480. Series.value_counts : Return the counts of values in a Series.
  6481. Examples
  6482. --------
  6483. >>> df = pd.DataFrame([('bird', 2, 2),
  6484. ... ('mammal', 4, np.nan),
  6485. ... ('arthropod', 8, 0),
  6486. ... ('bird', 2, np.nan)],
  6487. ... index=('falcon', 'horse', 'spider', 'ostrich'),
  6488. ... columns=('species', 'legs', 'wings'))
  6489. >>> df
  6490. species legs wings
  6491. falcon bird 2 2.0
  6492. horse mammal 4 NaN
  6493. spider arthropod 8 0.0
  6494. ostrich bird 2 NaN
6495. By default, missing values are not considered, and the modes of wings
6496. are both 0.0 and 2.0. The second row of species and legs contains ``NaN``,
  6497. because they have only one mode, but the DataFrame has two rows.
  6498. >>> df.mode()
  6499. species legs wings
  6500. 0 bird 2.0 0.0
  6501. 1 NaN NaN 2.0
6502. Setting ``dropna=False``, ``NaN`` values are considered and they can be
  6503. the mode (like for wings).
  6504. >>> df.mode(dropna=False)
  6505. species legs wings
  6506. 0 bird 2 NaN
  6507. Setting ``numeric_only=True``, only the mode of numeric columns is
  6508. computed, and columns of other types are ignored.
  6509. >>> df.mode(numeric_only=True)
  6510. legs wings
  6511. 0 2.0 0.0
  6512. 1 NaN 2.0
  6513. To compute the mode over columns and not rows, use the axis parameter:
  6514. >>> df.mode(axis='columns', numeric_only=True)
  6515. 0 1
  6516. falcon 2.0 NaN
  6517. horse 4.0 NaN
  6518. spider 0.0 8.0
  6519. ostrich 2.0 NaN
  6520. """
  6521. data = self if not numeric_only else self._get_numeric_data()
  6522. def f(s):
  6523. return s.mode(dropna=dropna)
  6524. return data.apply(f, axis=axis)
  6525. def quantile(self, q=0.5, axis=0, numeric_only=True,
  6526. interpolation='linear'):
  6527. """
  6528. Return values at the given quantile over requested axis.
  6529. Parameters
  6530. ----------
  6531. q : float or array-like, default 0.5 (50% quantile)
6532. Value(s) between 0 and 1, the quantile(s) to compute.
6533. axis : {0 or 'index', 1 or 'columns'}, default 0
6534. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
  6535. numeric_only : bool, default True
  6536. If False, the quantile of datetime and timedelta data will be
  6537. computed as well.
  6538. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
  6539. This optional parameter specifies the interpolation method to use,
  6540. when the desired quantile lies between two data points `i` and `j`:
  6541. * linear: `i + (j - i) * fraction`, where `fraction` is the
  6542. fractional part of the index surrounded by `i` and `j`.
  6543. * lower: `i`.
  6544. * higher: `j`.
  6545. * nearest: `i` or `j` whichever is nearest.
  6546. * midpoint: (`i` + `j`) / 2.
  6547. .. versionadded:: 0.18.0
  6548. Returns
  6549. -------
  6550. Series or DataFrame
  6551. If ``q`` is an array, a DataFrame will be returned where the
  6552. index is ``q``, the columns are the columns of self, and the
  6553. values are the quantiles.
  6554. If ``q`` is a float, a Series will be returned where the
  6555. index is the columns of self and the values are the quantiles.
  6556. See Also
  6557. --------
  6558. core.window.Rolling.quantile: Rolling quantile.
  6559. numpy.percentile: Numpy function to compute the percentile.
  6560. Examples
  6561. --------
  6562. >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
  6563. ... columns=['a', 'b'])
  6564. >>> df.quantile(.1)
  6565. a 1.3
  6566. b 3.7
  6567. Name: 0.1, dtype: float64
  6568. >>> df.quantile([.1, .5])
  6569. a b
  6570. 0.1 1.3 3.7
  6571. 0.5 2.5 55.0
  6572. Specifying `numeric_only=False` will also compute the quantile of
  6573. datetime and timedelta data.
  6574. >>> df = pd.DataFrame({'A': [1, 2],
  6575. ... 'B': [pd.Timestamp('2010'),
  6576. ... pd.Timestamp('2011')],
  6577. ... 'C': [pd.Timedelta('1 days'),
  6578. ... pd.Timedelta('2 days')]})
  6579. >>> df.quantile(0.5, numeric_only=False)
  6580. A 1.5
  6581. B 2010-07-02 12:00:00
  6582. C 1 days 12:00:00
  6583. Name: 0.5, dtype: object
  6584. """
  6585. self._check_percentile(q)
  6586. data = self._get_numeric_data() if numeric_only else self
  6587. axis = self._get_axis_number(axis)
  6588. is_transposed = axis == 1
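# Quantiles are computed block-wise along axis=1 internally, so a
# row-wise request (axis=1) is handled by transposing the data first
# and transposing the result back below.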
  6589. if is_transposed:
  6590. data = data.T
  6591. result = data._data.quantile(qs=q,
  6592. axis=1,
  6593. interpolation=interpolation,
  6594. transposed=is_transposed)
  6595. if result.ndim == 2:
  6596. result = self._constructor(result)
  6597. else:
  6598. result = self._constructor_sliced(result, name=q)
  6599. if is_transposed:
  6600. result = result.T
  6601. return result
  6602. def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
  6603. """
  6604. Cast to DatetimeIndex of timestamps, at *beginning* of period.
  6605. Parameters
  6606. ----------
  6607. freq : str, default frequency of PeriodIndex
  6608. Desired frequency.
  6609. how : {'s', 'e', 'start', 'end'}
  6610. Convention for converting period to timestamp; start of period
  6611. vs. end.
  6612. axis : {0 or 'index', 1 or 'columns'}, default 0
  6613. The axis to convert (the index by default).
  6614. copy : bool, default True
  6615. If False then underlying input data is not copied.
  6616. Returns
  6617. -------
  6618. DataFrame with DatetimeIndex
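Examples
--------
A minimal, illustrative example (hypothetical data); the PeriodIndex
is replaced by timestamps at the start of each period.

>>> idx = pd.period_range('2018', periods=2, freq='A')
>>> df = pd.DataFrame({'n': [1, 2]}, index=idx)
>>> df.to_timestamp()
            n
2018-01-01  1
2019-01-01  2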
  6619. """
  6620. new_data = self._data
  6621. if copy:
  6622. new_data = new_data.copy()
  6623. axis = self._get_axis_number(axis)
  6624. if axis == 0:
  6625. new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
  6626. elif axis == 1:
  6627. new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
  6628. else: # pragma: no cover
  6629. raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
  6630. ax=axis))
  6631. return self._constructor(new_data)
  6632. def to_period(self, freq=None, axis=0, copy=True):
  6633. """
  6634. Convert DataFrame from DatetimeIndex to PeriodIndex with desired
  6635. frequency (inferred from index if not passed).
  6636. Parameters
  6637. ----------
6638. freq : str, optional
  6639. Frequency of the PeriodIndex.
  6640. axis : {0 or 'index', 1 or 'columns'}, default 0
  6641. The axis to convert (the index by default).
  6642. copy : bool, default True
  6643. If False then underlying input data is not copied.
  6644. Returns
  6645. -------
6646. DataFrame with PeriodIndex
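Examples
--------
A minimal, illustrative example (hypothetical data); the frequency is
inferred from the DatetimeIndex when not passed.

>>> idx = pd.date_range('2018-12-31', periods=2, freq='A')
>>> df = pd.DataFrame({'n': [1, 2]}, index=idx)
>>> df.to_period().index
PeriodIndex(['2018', '2019'], dtype='period[A-DEC]', freq='A-DEC')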
  6647. """
  6648. new_data = self._data
  6649. if copy:
  6650. new_data = new_data.copy()
  6651. axis = self._get_axis_number(axis)
  6652. if axis == 0:
  6653. new_data.set_axis(1, self.index.to_period(freq=freq))
  6654. elif axis == 1:
  6655. new_data.set_axis(0, self.columns.to_period(freq=freq))
  6656. else: # pragma: no cover
  6657. raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
  6658. ax=axis))
  6659. return self._constructor(new_data)
  6660. def isin(self, values):
  6661. """
  6662. Whether each element in the DataFrame is contained in values.
  6663. Parameters
  6664. ----------
  6665. values : iterable, Series, DataFrame or dict
  6666. The result will only be true at a location if all the
  6667. labels match. If `values` is a Series, that's the index. If
  6668. `values` is a dict, the keys must be the column names,
  6669. which must match. If `values` is a DataFrame,
  6670. then both the index and column labels must match.
  6671. Returns
  6672. -------
  6673. DataFrame
  6674. DataFrame of booleans showing whether each element in the DataFrame
  6675. is contained in values.
  6676. See Also
  6677. --------
  6678. DataFrame.eq: Equality test for DataFrame.
  6679. Series.isin: Equivalent method on Series.
  6680. Series.str.contains: Test if pattern or regex is contained within a
  6681. string of a Series or Index.
  6682. Examples
  6683. --------
  6684. >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
  6685. ... index=['falcon', 'dog'])
  6686. >>> df
  6687. num_legs num_wings
  6688. falcon 2 2
  6689. dog 4 0
  6690. When ``values`` is a list check whether every value in the DataFrame
  6691. is present in the list (which animals have 0 or 2 legs or wings)
  6692. >>> df.isin([0, 2])
  6693. num_legs num_wings
  6694. falcon True True
  6695. dog False True
  6696. When ``values`` is a dict, we can pass values to check for each
  6697. column separately:
  6698. >>> df.isin({'num_wings': [0, 3]})
  6699. num_legs num_wings
  6700. falcon False False
  6701. dog False True
  6702. When ``values`` is a Series or DataFrame the index and column must
  6703. match. Note that 'falcon' does not match based on the number of legs
  6704. in df2.
  6705. >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]},
  6706. ... index=['spider', 'falcon'])
  6707. >>> df.isin(other)
  6708. num_legs num_wings
  6709. falcon True True
  6710. dog False False
  6711. """
  6712. if isinstance(values, dict):
  6713. from pandas.core.reshape.concat import concat
  6714. values = collections.defaultdict(list, values)
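# Columns not named in the dict default to an empty list, so their
# membership test comes back all-False.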
  6715. return concat((self.iloc[:, [i]].isin(values[col])
  6716. for i, col in enumerate(self.columns)), axis=1)
  6717. elif isinstance(values, Series):
  6718. if not values.index.is_unique:
  6719. raise ValueError("cannot compute isin with "
  6720. "a duplicate axis.")
  6721. return self.eq(values.reindex_like(self), axis='index')
  6722. elif isinstance(values, DataFrame):
  6723. if not (values.columns.is_unique and values.index.is_unique):
  6724. raise ValueError("cannot compute isin with "
  6725. "a duplicate axis.")
  6726. return self.eq(values.reindex_like(self))
  6727. else:
  6728. if not is_list_like(values):
  6729. raise TypeError("only list-like or dict-like objects are "
  6730. "allowed to be passed to DataFrame.isin(), "
  6731. "you passed a "
  6732. "{0!r}".format(type(values).__name__))
  6733. return DataFrame(
  6734. algorithms.isin(self.values.ravel(),
  6735. values).reshape(self.shape), self.index,
  6736. self.columns)
  6737. # ----------------------------------------------------------------------
  6738. # Add plotting methods to DataFrame
  6739. plot = CachedAccessor("plot", gfx.FramePlotMethods)
  6740. hist = gfx.hist_frame
  6741. boxplot = gfx.boxplot_frame
  6742. DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
  6743. axes_are_reversed=True, aliases={'rows': 0},
  6744. docs={
  6745. 'index': 'The index (row labels) of the DataFrame.',
  6746. 'columns': 'The column labels of the DataFrame.'})
  6747. DataFrame._add_numeric_operations()
  6748. DataFrame._add_series_or_dataframe_operations()
  6749. ops.add_flex_arithmetic_methods(DataFrame)
  6750. ops.add_special_arithmetic_methods(DataFrame)
  6751. def _from_nested_dict(data):
  6752. # TODO: this should be seriously cythonized
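# Transposes a dict of dicts: {index: {col: val}} -> {col: {index: val}};
# e.g. (illustrative keys) {'r1': {'c1': 1}} becomes {'c1': {'r1': 1}}.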
  6753. new_data = OrderedDict()
  6754. for index, s in compat.iteritems(data):
  6755. for col, v in compat.iteritems(s):
  6756. new_data[col] = new_data.get(col, OrderedDict())
  6757. new_data[col][index] = v
  6758. return new_data
  6759. def _put_str(s, space):
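# Render `s` as text, truncate it to `space` characters, and left-justify
# it in a field of that width.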
  6760. return u'{s}'.format(s=s)[:space].ljust(space)