PageRenderTime 68ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/core/frame.py

http://github.com/wesm/pandas
Python | 8037 lines | 8013 code | 10 blank | 14 comment | 24 complexity | f7a69c2d03a0237ebc71af5ae471cb7a MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. # pylint: disable=E1101
  2. # pylint: disable=W0212,W0703,W0622
  3. """
  4. DataFrame
  5. ---------
  6. An efficient 2D container for potentially mixed-type time series or other
  7. labeled data series.
  8. Similar to its R counterpart, data.frame, except providing automatic data
  9. alignment and a host of useful data manipulation methods having to do with the
  10. labeling information
  11. """
  12. from __future__ import division
  13. import collections
  14. from collections import OrderedDict
  15. import functools
  16. import itertools
  17. import sys
  18. import warnings
  19. from textwrap import dedent
  20. import numpy as np
  21. import numpy.ma as ma
  22. from pandas._libs import lib, algos as libalgos
  23. from pandas.util._decorators import (Appender, Substitution,
  24. rewrite_axis_style_signature,
  25. deprecate_kwarg)
  26. from pandas.util._validators import (validate_bool_kwarg,
  27. validate_axis_style_args)
  28. from pandas import compat
  29. from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u,
  30. PY36, raise_with_traceback, Iterator,
  31. string_and_binary_types)
  32. from pandas.compat.numpy import function as nv
  33. from pandas.core.dtypes.cast import (
  34. maybe_upcast,
  35. cast_scalar_to_array,
  36. infer_dtype_from_scalar,
  37. maybe_cast_to_datetime,
  38. maybe_infer_to_datetimelike,
  39. maybe_convert_platform,
  40. maybe_downcast_to_dtype,
  41. invalidate_string_dtypes,
  42. coerce_to_dtypes,
  43. maybe_upcast_putmask,
  44. find_common_type)
  45. from pandas.core.dtypes.common import (
  46. is_dict_like,
  47. is_datetime64tz_dtype,
  48. is_object_dtype,
  49. is_extension_type,
  50. is_extension_array_dtype,
  51. is_datetime64_any_dtype,
  52. is_bool_dtype,
  53. is_integer_dtype,
  54. is_float_dtype,
  55. is_integer,
  56. is_scalar,
  57. is_dtype_equal,
  58. needs_i8_conversion,
  59. infer_dtype_from_object,
  60. ensure_float64,
  61. ensure_int64,
  62. ensure_platform_int,
  63. is_list_like,
  64. is_nested_list_like,
  65. is_iterator,
  66. is_sequence,
  67. is_named_tuple)
  68. from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
  69. from pandas.core.dtypes.missing import isna, notna
  70. from pandas.core import algorithms
  71. from pandas.core import common as com
  72. from pandas.core import nanops
  73. from pandas.core import ops
  74. from pandas.core.accessor import CachedAccessor
  75. from pandas.core.arrays import Categorical, ExtensionArray
  76. from pandas.core.arrays.datetimelike import (
  77. DatetimeLikeArrayMixin as DatetimeLikeArray
  78. )
  79. from pandas.core.config import get_option
  80. from pandas.core.generic import NDFrame, _shared_docs
  81. from pandas.core.index import (Index, MultiIndex, ensure_index,
  82. ensure_index_from_sequences)
  83. from pandas.core.indexes import base as ibase
  84. from pandas.core.indexes.datetimes import DatetimeIndex
  85. from pandas.core.indexes.period import PeriodIndex
  86. from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
  87. check_bool_indexer)
  88. from pandas.core.internals import BlockManager
  89. from pandas.core.internals.construction import (
  90. masked_rec_array_to_mgr, get_names_from_index, to_arrays,
  91. reorder_arrays, init_ndarray, init_dict,
  92. arrays_to_mgr, sanitize_index)
  93. from pandas.core.series import Series
  94. from pandas.io.formats import console
  95. from pandas.io.formats import format as fmt
  96. from pandas.io.formats.printing import pprint_thing
  97. import pandas.plotting._core as gfx
# ---------------------------------------------------------------------
# Docstring templates
#
# These module-level constants are substituted into method docstrings
# (via the Appender/Substitution decorators) so that shared parameter
# descriptions are written once and reused across DataFrame methods.

# Keyword fragments shared between DataFrame and other NDFrame subclasses.
_shared_doc_kwargs = dict(
    axes='index, columns', klass='DataFrame',
    axes_single_arg="{0 or 'index', 1 or 'columns'}",
    axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    optional_by="""
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels

        .. versionchanged:: 0.23.0
           Allow specifying index or column level names.""",
    versionadded_to_excel='',
    optional_labels="""labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    optional_axis="""axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
)

# Description of the common ``numeric_only`` parameter.
_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data
"""

# Full docstring for DataFrame.merge / pandas.merge; the ``%s`` slot is
# filled with an extra leading parameter description when used for the
# top-level ``pandas.merge`` function.
_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : tuple of (str, str), default ('_x', '_y')
    Suffix to apply to overlapping column names in the left and right
    side, respectively. To raise an exception on overlapping columns use
    (False, False).
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to output DataFrame called "_merge" with
    information on the source of each row.
    If string, column with information on source of each row will be added to
    output DataFrame, and column will be named value of string.
    Information column is Categorical-type and takes on a value of "left_only"
    for observations whose merge key only appears in 'left' DataFrame,
    "right_only" for observations whose merge key only appears in 'right'
    DataFrame, and "both" if the observation's merge key is found in both.
validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

    .. versionadded:: 0.21.0

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0
Support for merging named Series objects was added in version 0.24.0

Examples
--------

>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
    lkey value
0   foo      1
1   bar      2
2   baz      3
3   foo      5
>>> df2
    rkey value
0   foo      5
1   bar      6
2   baz      7
3   foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')
"""
  253. # -----------------------------------------------------------------------
  254. # DataFrame class
  255. class DataFrame(NDFrame):
  256. """
  257. Two-dimensional size-mutable, potentially heterogeneous tabular data
  258. structure with labeled axes (rows and columns). Arithmetic operations
  259. align on both row and column labels. Can be thought of as a dict-like
  260. container for Series objects. The primary pandas data structure.
  261. Parameters
  262. ----------
  263. data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
  264. Dict can contain Series, arrays, constants, or list-like objects
  265. .. versionchanged :: 0.23.0
  266. If data is a dict, argument order is maintained for Python 3.6
  267. and later.
  268. index : Index or array-like
  269. Index to use for resulting frame. Will default to RangeIndex if
  270. no indexing information part of input data and no index provided
  271. columns : Index or array-like
  272. Column labels to use for resulting frame. Will default to
  273. RangeIndex (0, 1, 2, ..., n) if no column labels are provided
  274. dtype : dtype, default None
  275. Data type to force. Only a single dtype is allowed. If None, infer
  276. copy : boolean, default False
  277. Copy data from inputs. Only affects DataFrame / 2d ndarray input
  278. See Also
  279. --------
  280. DataFrame.from_records : Constructor from tuples, also record arrays.
  281. DataFrame.from_dict : From dicts of Series, arrays, or dicts.
  282. DataFrame.from_items : From sequence of (key, value) pairs
  283. read_csv, pandas.read_table, pandas.read_clipboard.
  284. Examples
  285. --------
  286. Constructing DataFrame from a dictionary.
  287. >>> d = {'col1': [1, 2], 'col2': [3, 4]}
  288. >>> df = pd.DataFrame(data=d)
  289. >>> df
  290. col1 col2
  291. 0 1 3
  292. 1 2 4
  293. Notice that the inferred dtype is int64.
  294. >>> df.dtypes
  295. col1 int64
  296. col2 int64
  297. dtype: object
  298. To enforce a single dtype:
  299. >>> df = pd.DataFrame(data=d, dtype=np.int8)
  300. >>> df.dtypes
  301. col1 int8
  302. col2 int8
  303. dtype: object
  304. Constructing DataFrame from numpy ndarray:
  305. >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
  306. ... columns=['a', 'b', 'c'])
  307. >>> df2
  308. a b c
  309. 0 1 2 3
  310. 1 4 5 6
  311. 2 7 8 9
  312. """
    @property
    def _constructor(self):
        # Class used to build results of operations that preserve
        # dimensionality (2D in, 2D out).
        return DataFrame

    # Class used when an operation reduces dimensionality (2D -> 1D),
    # e.g. selecting a single column.
    _constructor_sliced = Series

    # Attribute/method names whose use should emit a deprecation warning;
    # extends the set inherited from NDFrame with DataFrame-specific ones.
    _deprecations = NDFrame._deprecations | frozenset(
        ['get_value', 'set_value', 'from_csv', 'from_items'])

    # Names of registered custom accessors (populated externally via
    # CachedAccessor registration).
    _accessors = set()

    @property
    def _constructor_expanddim(self):
        # Class used when an operation increases dimensionality (2D -> 3D).
        # Imported lazily; Panel is deprecated and importing it at module
        # load would be wasteful.
        from pandas.core.panel import Panel
        return Panel
  324. # ----------------------------------------------------------------------
  325. # Constructors
    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 copy=False):
        # Dispatch on the type of ``data`` to build a BlockManager, then
        # hand it to NDFrame.__init__.  Branch order matters: more specific
        # types (BlockManager, dict, masked arrays) are checked before the
        # generic ndarray/Series/Iterable cases.
        if data is None:
            # No data: treat as an empty dict so index/columns still apply.
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            # Unwrap to the underlying BlockManager and fall through.
            data = data._data

        if isinstance(data, BlockManager):
            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif isinstance(data, dict):
            mgr = init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords
            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = masked_rec_array_to_mgr(data, index, columns, dtype,
                                              copy)
            # a masked array
            else:
                mask = ma.getmaskarray(data)
                if mask.any():
                    # Upcast so masked slots can hold the fill value
                    # (e.g. int -> float to allow NaN).
                    data, fill_value = maybe_upcast(data, copy=True)
                    data.soften_mask()  # set hardmask False if it was True
                    data[mask] = fill_value
                else:
                    data = data.copy()
                mgr = init_ndarray(data, index, columns, dtype=dtype,
                                   copy=copy)
        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                # Structured/record array: treat each named field as a column.
                data_columns = list(data.dtype.names)
                data = {k: data[k] for k in data_columns}
                if columns is None:
                    columns = data_columns
                mgr = init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, 'name', None) is not None:
                # Named Series/Index: becomes a single column keyed by name.
                mgr = init_dict({data.name: data}, index, columns,
                                dtype=dtype)
            else:
                mgr = init_ndarray(data, index, columns, dtype=dtype,
                                   copy=copy)
        # For data is list-like, or Iterable (will consume into list)
        elif (isinstance(data, compat.Iterable)
              and not isinstance(data, string_and_binary_types)):
            if not isinstance(data, compat.Sequence):
                # Generic iterator/generator: materialize it once.
                data = list(data)
            if len(data) > 0:
                if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                    # List of rows (lists/tuples/Series/namedtuples).
                    if is_named_tuple(data[0]) and columns is None:
                        # Column labels default to the namedtuple's fields.
                        columns = data[0]._fields
                    arrays, columns = to_arrays(data, columns, dtype=dtype)
                    columns = ensure_index(columns)

                    # set the index
                    if index is None:
                        if isinstance(data[0], Series):
                            index = get_names_from_index(data)
                        elif isinstance(data[0], Categorical):
                            index = ibase.default_index(len(data[0]))
                        else:
                            index = ibase.default_index(len(data))

                    mgr = arrays_to_mgr(arrays, columns, index, columns,
                                        dtype=dtype)
                else:
                    mgr = init_ndarray(data, index, columns, dtype=dtype,
                                       copy=copy)
            else:
                # Empty list: same as empty dict.
                mgr = init_dict({}, index, columns, dtype=dtype)
        else:
            # Last resort: scalar or unknown type; let numpy try.
            try:
                arr = np.array(data, dtype=dtype, copy=copy)
            except (ValueError, TypeError) as e:
                exc = TypeError('DataFrame constructor called with '
                                'incompatible data and dtype: {e}'.format(e=e))
                raise_with_traceback(exc)

            if arr.ndim == 0 and index is not None and columns is not None:
                # Scalar broadcast to the full (index, columns) shape.
                values = cast_scalar_to_array((len(index), len(columns)),
                                              data, dtype=dtype)
                mgr = init_ndarray(values, index, columns,
                                   dtype=values.dtype, copy=False)
            else:
                raise ValueError('DataFrame constructor not properly called!')

        NDFrame.__init__(self, mgr, fastpath=True)
  410. # ----------------------------------------------------------------------
  411. @property
  412. def axes(self):
  413. """
  414. Return a list representing the axes of the DataFrame.
  415. It has the row axis labels and column axis labels as the only members.
  416. They are returned in that order.
  417. Examples
  418. --------
  419. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
  420. >>> df.axes
  421. [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
  422. dtype='object')]
  423. """
  424. return [self.index, self.columns]
  425. @property
  426. def shape(self):
  427. """
  428. Return a tuple representing the dimensionality of the DataFrame.
  429. See Also
  430. --------
  431. ndarray.shape
  432. Examples
  433. --------
  434. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
  435. >>> df.shape
  436. (2, 2)
  437. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
  438. ... 'col3': [5, 6]})
  439. >>> df.shape
  440. (2, 3)
  441. """
  442. return len(self.index), len(self.columns)
  443. @property
  444. def _is_homogeneous_type(self):
  445. """
  446. Whether all the columns in a DataFrame have the same type.
  447. Returns
  448. -------
  449. bool
  450. Examples
  451. --------
  452. >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
  453. True
  454. >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
  455. False
  456. Items with the same type but different sizes are considered
  457. different types.
  458. >>> DataFrame({
  459. ... "A": np.array([1, 2], dtype=np.int32),
  460. ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
  461. False
  462. """
  463. if self._data.any_extension_types:
  464. return len({block.dtype for block in self._data.blocks}) == 1
  465. else:
  466. return not self._data.is_mixed_type
  467. # ----------------------------------------------------------------------
  468. # Rendering Methods
  469. def _repr_fits_vertical_(self):
  470. """
  471. Check length against max_rows.
  472. """
  473. max_rows = get_option("display.max_rows")
  474. return len(self) <= max_rows
    def _repr_fits_horizontal_(self, ignore_width=False):
        """
        Check if the full repr fits in the horizontal boundaries imposed by
        the display options ``display.width`` and ``display.max_columns``.

        In case of a non-interactive session, no boundaries apply.

        `ignore_width` is here so ipython notebook + HTML output can behave
        the way users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if ((max_columns and nb_columns > max_columns) or
                ((not ignore_width) and width and nb_columns > (width // 2))):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or not console.in_interactive_session():
            return True

        if (get_option('display.width') is not None or
                console.in_ipython_frontend()):
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if not (max_rows is None):  # max_rows is set: only render the
            # rows that will actually be shown
            # min of two, where one may be None
            d = d.iloc[:min(max_rows, len(d))]
        else:
            # unlimited rows requested: assume it fits rather than render
            # the entire frame
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        # widest rendered line must fit in the detected console width
        repr_width = max(len(l) for l in value.split('\n'))

        return repr_width < width
  517. def _info_repr(self):
  518. """
  519. True if the repr should show the info view.
  520. """
  521. info_repr_option = (get_option("display.large_repr") == "info")
  522. return info_repr_option and not (self._repr_fits_horizontal_() and
  523. self._repr_fits_vertical_())
  524. def __unicode__(self):
  525. """
  526. Return a string representation for a particular DataFrame.
  527. Invoked by unicode(df) in py2 only. Yields a Unicode String in both
  528. py2/py3.
  529. """
  530. buf = StringIO(u(""))
  531. if self._info_repr():
  532. self.info(buf=buf)
  533. return buf.getvalue()
  534. max_rows = get_option("display.max_rows")
  535. max_cols = get_option("display.max_columns")
  536. show_dimensions = get_option("display.show_dimensions")
  537. if get_option("display.expand_frame_repr"):
  538. width, _ = console.get_console_size()
  539. else:
  540. width = None
  541. self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
  542. line_width=width, show_dimensions=show_dimensions)
  543. return buf.getvalue()
  544. def _repr_html_(self):
  545. """
  546. Return a html representation for a particular DataFrame.
  547. Mainly for IPython notebook.
  548. """
  549. if self._info_repr():
  550. buf = StringIO(u(""))
  551. self.info(buf=buf)
  552. # need to escape the <class>, should be the first line.
  553. val = buf.getvalue().replace('<', r'&lt;', 1)
  554. val = val.replace('>', r'&gt;', 1)
  555. return '<pre>' + val + '</pre>'
  556. if get_option("display.notebook_repr_html"):
  557. max_rows = get_option("display.max_rows")
  558. max_cols = get_option("display.max_columns")
  559. show_dimensions = get_option("display.show_dimensions")
  560. return self.to_html(max_rows=max_rows, max_cols=max_cols,
  561. show_dimensions=show_dimensions, notebook=True)
  562. else:
  563. return None
    @Substitution(header='Write out the column names. If a list of strings '
                         'is given, it is assumed to be aliases for the '
                         'column names')
    @Substitution(shared_params=fmt.common_docstring,
                  returns=fmt.return_docstring)
    def to_string(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
                  sparsify=None, index_names=True, justify=None,
                  max_rows=None, max_cols=None, show_dimensions=False,
                  decimal='.', line_width=None):
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        %(returns)s

        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        # All rendering logic lives in DataFrameFormatter; this method just
        # forwards the options and collects the output.
        formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
                                           col_space=col_space, na_rep=na_rep,
                                           formatters=formatters,
                                           float_format=float_format,
                                           sparsify=sparsify, justify=justify,
                                           index_names=index_names,
                                           header=header, index=index,
                                           max_rows=max_rows,
                                           max_cols=max_cols,
                                           show_dimensions=show_dimensions,
                                           decimal=decimal,
                                           line_width=line_width)
        formatter.to_string()

        # When no buffer was supplied, return the rendered text instead of
        # writing it anywhere (otherwise the caller's buffer received it).
        if buf is None:
            result = formatter.buf.getvalue()
            return result
  609. # ----------------------------------------------------------------------
  610. @property
  611. def style(self):
  612. """
  613. Property returning a Styler object containing methods for
  614. building a styled HTML representation fo the DataFrame.
  615. See Also
  616. --------
  617. io.formats.style.Styler
  618. """
  619. from pandas.io.formats.style import Styler
  620. return Styler(self)
  621. def iteritems(self):
  622. r"""
  623. Iterator over (column name, Series) pairs.
  624. Iterates over the DataFrame columns, returning a tuple with
  625. the column name and the content as a Series.
  626. Yields
  627. ------
  628. label : object
  629. The column names for the DataFrame being iterated over.
  630. content : Series
  631. The column entries belonging to each label, as a Series.
  632. See Also
  633. --------
  634. DataFrame.iterrows : Iterate over DataFrame rows as
  635. (index, Series) pairs.
  636. DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
  637. of the values.
  638. Examples
  639. --------
  640. >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
  641. ... 'population': [1864, 22000, 80000]},
  642. ... index=['panda', 'polar', 'koala'])
  643. >>> df
  644. species population
  645. panda bear 1864
  646. polar bear 22000
  647. koala marsupial 80000
  648. >>> for label, content in df.iteritems():
  649. ... print('label:', label)
  650. ... print('content:', content, sep='\n')
  651. ...
  652. label: species
  653. content:
  654. panda bear
  655. polar bear
  656. koala marsupial
  657. Name: species, dtype: object
  658. label: population
  659. content:
  660. panda 1864
  661. polar 22000
  662. koala 80000
  663. Name: population, dtype: int64
  664. """
  665. if self.columns.is_unique and hasattr(self, '_item_cache'):
  666. for k in self.columns:
  667. yield k, self._get_item_cache(k)
  668. else:
  669. for i, k in enumerate(self.columns):
  670. yield k, self._ixs(i, axis=1)
  671. def iterrows(self):
  672. """
  673. Iterate over DataFrame rows as (index, Series) pairs.
  674. Yields
  675. ------
  676. index : label or tuple of label
  677. The index of the row. A tuple for a `MultiIndex`.
  678. data : Series
  679. The data of the row as a Series.
  680. it : generator
  681. A generator that iterates over the rows of the frame.
  682. See Also
  683. --------
  684. itertuples : Iterate over DataFrame rows as namedtuples of the values.
  685. iteritems : Iterate over (column name, Series) pairs.
  686. Notes
  687. -----
  688. 1. Because ``iterrows`` returns a Series for each row,
  689. it does **not** preserve dtypes across the rows (dtypes are
  690. preserved across columns for DataFrames). For example,
  691. >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
  692. >>> row = next(df.iterrows())[1]
  693. >>> row
  694. int 1.0
  695. float 1.5
  696. Name: 0, dtype: float64
  697. >>> print(row['int'].dtype)
  698. float64
  699. >>> print(df['int'].dtype)
  700. int64
  701. To preserve dtypes while iterating over the rows, it is better
  702. to use :meth:`itertuples` which returns namedtuples of the values
  703. and which is generally faster than ``iterrows``.
  704. 2. You should **never modify** something you are iterating over.
  705. This is not guaranteed to work in all cases. Depending on the
  706. data types, the iterator returns a copy and not a view, and writing
  707. to it will have no effect.
  708. """
  709. columns = self.columns
  710. klass = self._constructor_sliced
  711. for k, v in zip(self.index, self.values):
  712. s = klass(v, index=columns, name=k)
  713. yield k, s
  714. def itertuples(self, index=True, name="Pandas"):
  715. """
  716. Iterate over DataFrame rows as namedtuples.
  717. Parameters
  718. ----------
  719. index : bool, default True
  720. If True, return the index as the first element of the tuple.
  721. name : str or None, default "Pandas"
  722. The name of the returned namedtuples or None to return regular
  723. tuples.
  724. Yields
  725. -------
  726. collections.namedtuple
  727. Yields a namedtuple for each row in the DataFrame with the first
  728. field possibly being the index and following fields being the
  729. column values.
  730. See Also
  731. --------
  732. DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
  733. pairs.
  734. DataFrame.iteritems : Iterate over (column name, Series) pairs.
  735. Notes
  736. -----
  737. The column names will be renamed to positional names if they are
  738. invalid Python identifiers, repeated, or start with an underscore.
  739. With a large number of columns (>255), regular tuples are returned.
  740. Examples
  741. --------
  742. >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
  743. ... index=['dog', 'hawk'])
  744. >>> df
  745. num_legs num_wings
  746. dog 4 0
  747. hawk 2 2
  748. >>> for row in df.itertuples():
  749. ... print(row)
  750. ...
  751. Pandas(Index='dog', num_legs=4, num_wings=0)
  752. Pandas(Index='hawk', num_legs=2, num_wings=2)
  753. By setting the `index` parameter to False we can remove the index
  754. as the first element of the tuple:
  755. >>> for row in df.itertuples(index=False):
  756. ... print(row)
  757. ...
  758. Pandas(num_legs=4, num_wings=0)
  759. Pandas(num_legs=2, num_wings=2)
  760. With the `name` parameter set we set a custom name for the yielded
  761. namedtuples:
  762. >>> for row in df.itertuples(name='Animal'):
  763. ... print(row)
  764. ...
  765. Animal(Index='dog', num_legs=4, num_wings=0)
  766. Animal(Index='hawk', num_legs=2, num_wings=2)
  767. """
  768. arrays = []
  769. fields = list(self.columns)
  770. if index:
  771. arrays.append(self.index)
  772. fields.insert(0, "Index")
  773. # use integer indexing because of possible duplicate column names
  774. arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
  775. # Python 3 supports at most 255 arguments to constructor, and
  776. # things get slow with this many fields in Python 2
  777. if name is not None and len(self.columns) + index < 256:
  778. # `rename` is unsupported in Python 2.6
  779. try:
  780. itertuple = collections.namedtuple(name, fields, rename=True)
  781. return map(itertuple._make, zip(*arrays))
  782. except Exception:
  783. pass
  784. # fallback to regular tuples
  785. return zip(*arrays)
  786. items = iteritems
  787. def __len__(self):
  788. """
  789. Returns length of info axis, but here we use the index.
  790. """
  791. return len(self.index)
  792. def dot(self, other):
  793. """
  794. Compute the matrix mutiplication between the DataFrame and other.
  795. This method computes the matrix product between the DataFrame and the
  796. values of an other Series, DataFrame or a numpy array.
  797. It can also be called using ``self @ other`` in Python >= 3.5.
  798. Parameters
  799. ----------
  800. other : Series, DataFrame or array-like
  801. The other object to compute the matrix product with.
  802. Returns
  803. -------
  804. Series or DataFrame
  805. If other is a Series, return the matrix product between self and
  806. other as a Serie. If other is a DataFrame or a numpy.array, return
  807. the matrix product of self and other in a DataFrame of a np.array.
  808. See Also
  809. --------
  810. Series.dot: Similar method for Series.
  811. Notes
  812. -----
  813. The dimensions of DataFrame and other must be compatible in order to
  814. compute the matrix multiplication.
  815. The dot method for Series computes the inner product, instead of the
  816. matrix product here.
  817. Examples
  818. --------
  819. Here we multiply a DataFrame with a Series.
  820. >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
  821. >>> s = pd.Series([1, 1, 2, 1])
  822. >>> df.dot(s)
  823. 0 -4
  824. 1 5
  825. dtype: int64
  826. Here we multiply a DataFrame with another DataFrame.
  827. >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
  828. >>> df.dot(other)
  829. 0 1
  830. 0 1 4
  831. 1 2 2
  832. Note that the dot method give the same result as @
  833. >>> df @ other
  834. 0 1
  835. 0 1 4
  836. 1 2 2
  837. The dot method works also if other is an np.array.
  838. >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
  839. >>> df.dot(arr)
  840. 0 1
  841. 0 1 4
  842. 1 2 2
  843. """
  844. if isinstance(other, (Series, DataFrame)):
  845. common = self.columns.union(other.index)
  846. if (len(common) > len(self.columns) or
  847. len(common) > len(other.index)):
  848. raise ValueError('matrices are not aligned')
  849. left = self.reindex(columns=common, copy=False)
  850. right = other.reindex(index=common, copy=False)
  851. lvals = left.values
  852. rvals = right.values
  853. else:
  854. left = self
  855. lvals = self.values
  856. rvals = np.asarray(other)
  857. if lvals.shape[1] != rvals.shape[0]:
  858. raise ValueError('Dot product shape mismatch, '
  859. '{s} vs {r}'.format(s=lvals.shape,
  860. r=rvals.shape))
  861. if isinstance(other, DataFrame):
  862. return self._constructor(np.dot(lvals, rvals), index=left.index,
  863. columns=other.columns)
  864. elif isinstance(other, Series):
  865. return Series(np.dot(lvals, rvals), index=left.index)
  866. elif isinstance(rvals, (np.ndarray, Index)):
  867. result = np.dot(lvals, rvals)
  868. if result.ndim == 2:
  869. return self._constructor(result, index=left.index)
  870. else:
  871. return Series(result, index=left.index)
  872. else: # pragma: no cover
  873. raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
  874. def __matmul__(self, other):
  875. """
  876. Matrix multiplication using binary `@` operator in Python>=3.5.
  877. """
  878. return self.dot(other)
  879. def __rmatmul__(self, other):
  880. """
  881. Matrix multiplication using binary `@` operator in Python>=3.5.
  882. """
  883. return self.T.dot(np.transpose(other)).T
  884. # ----------------------------------------------------------------------
  885. # IO methods (to / from other formats)
  886. @classmethod
  887. def from_dict(cls, data, orient='columns', dtype=None, columns=None):
  888. """
  889. Construct DataFrame from dict of array-like or dicts.
  890. Creates DataFrame object from dictionary by columns or by index
  891. allowing dtype specification.
  892. Parameters
  893. ----------
  894. data : dict
  895. Of the form {field : array-like} or {field : dict}.
  896. orient : {'columns', 'index'}, default 'columns'
  897. The "orientation" of the data. If the keys of the passed dict
  898. should be the columns of the resulting DataFrame, pass 'columns'
  899. (default). Otherwise if the keys should be rows, pass 'index'.
  900. dtype : dtype, default None
  901. Data type to force, otherwise infer.
  902. columns : list, default None
  903. Column labels to use when ``orient='index'``. Raises a ValueError
  904. if used with ``orient='columns'``.
  905. .. versionadded:: 0.23.0
  906. Returns
  907. -------
  908. DataFrame
  909. See Also
  910. --------
  911. DataFrame.from_records : DataFrame from ndarray (structured
  912. dtype), list of tuples, dict, or DataFrame.
  913. DataFrame : DataFrame object creation using constructor.
  914. Examples
  915. --------
  916. By default the keys of the dict become the DataFrame columns:
  917. >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
  918. >>> pd.DataFrame.from_dict(data)
  919. col_1 col_2
  920. 0 3 a
  921. 1 2 b
  922. 2 1 c
  923. 3 0 d
  924. Specify ``orient='index'`` to create the DataFrame using dictionary
  925. keys as rows:
  926. >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
  927. >>> pd.DataFrame.from_dict(data, orient='index')
  928. 0 1 2 3
  929. row_1 3 2 1 0
  930. row_2 a b c d
  931. When using the 'index' orientation, the column names can be
  932. specified manually:
  933. >>> pd.DataFrame.from_dict(data, orient='index',
  934. ... columns=['A', 'B', 'C', 'D'])
  935. A B C D
  936. row_1 3 2 1 0
  937. row_2 a b c d
  938. """
  939. index = None
  940. orient = orient.lower()
  941. if orient == 'index':
  942. if len(data) > 0:
  943. # TODO speed up Series case
  944. if isinstance(list(data.values())[0], (Series, dict)):
  945. data = _from_nested_dict(data)
  946. else:
  947. data, index = list(data.values()), list(data.keys())
  948. elif orient == 'columns':
  949. if columns is not None:
  950. raise ValueError("cannot use columns parameter with "
  951. "orient='columns'")
  952. else: # pragma: no cover
  953. raise ValueError('only recognize index or columns for orient')
  954. return cls(data, index=index, columns=columns, dtype=dtype)
  955. def to_numpy(self, dtype=None, copy=False):
  956. """
  957. Convert the DataFrame to a NumPy array.
  958. .. versionadded:: 0.24.0
  959. By default, the dtype of the returned array will be the common NumPy
  960. dtype of all types in the DataFrame. For example, if the dtypes are
  961. ``float16`` and ``float32``, the results dtype will be ``float32``.
  962. This may require copying data and coercing values, which may be
  963. expensive.
  964. Parameters
  965. ----------
  966. dtype : str or numpy.dtype, optional
  967. The dtype to pass to :meth:`numpy.asarray`
  968. copy : bool, default False
  969. Whether to ensure that the returned value is a not a view on
  970. another array. Note that ``copy=False`` does not *ensure* that
  971. ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
  972. a copy is made, even if not strictly necessary.
  973. Returns
  974. -------
  975. numpy.ndarray
  976. See Also
  977. --------
  978. Series.to_numpy : Similar method for Series.
  979. Examples
  980. --------
  981. >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
  982. array([[1, 3],
  983. [2, 4]])
  984. With heterogenous data, the lowest common type will have to
  985. be used.
  986. >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
  987. >>> df.to_numpy()
  988. array([[1. , 3. ],
  989. [2. , 4.5]])
  990. For a mix of numeric and non-numeric types, the output array will
  991. have object dtype.
  992. >>> df['C'] = pd.date_range('2000', periods=2)
  993. >>> df.to_numpy()
  994. array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
  995. [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
  996. """
  997. result = np.array(self.values, dtype=dtype, copy=copy)
  998. return result
  999. def to_dict(self, orient='dict', into=dict):
  1000. """
  1001. Convert the DataFrame to a dictionary.
  1002. The type of the key-value pairs can be customized with the parameters
  1003. (see below).
  1004. Parameters
  1005. ----------
  1006. orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
  1007. Determines the type of the values of the dictionary.
  1008. - 'dict' (default) : dict like {column -> {index -> value}}
  1009. - 'list' : dict like {column -> [values]}
  1010. - 'series' : dict like {column -> Series(values)}
  1011. - 'split' : dict like
  1012. {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
  1013. - 'records' : list like
  1014. [{column -> value}, ... , {column -> value}]
  1015. - 'index' : dict like {index -> {column -> value}}
  1016. Abbreviations are allowed. `s` indicates `series` and `sp`
  1017. indicates `split`.
  1018. into : class, default dict
  1019. The collections.Mapping subclass used for all Mappings
  1020. in the return value. Can be the actual class or an empty
  1021. instance of the mapping type you want. If you want a
  1022. collections.defaultdict, you must pass it initialized.
  1023. .. versionadded:: 0.21.0
  1024. Returns
  1025. -------
  1026. dict, list or collections.Mapping
  1027. Return a collections.Mapping object representing the DataFrame.
  1028. The resulting transformation depends on the `orient` parameter.
  1029. See Also
  1030. --------
  1031. DataFrame.from_dict: Create a DataFrame from a dictionary.
  1032. DataFrame.to_json: Convert a DataFrame to JSON format.
  1033. Examples
  1034. --------
  1035. >>> df = pd.DataFrame({'col1': [1, 2],
  1036. ... 'col2': [0.5, 0.75]},
  1037. ... index=['row1', 'row2'])
  1038. >>> df
  1039. col1 col2
  1040. row1 1 0.50
  1041. row2 2 0.75
  1042. >>> df.to_dict()
  1043. {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
  1044. You can specify the return orientation.
  1045. >>> df.to_dict('series')
  1046. {'col1': row1 1
  1047. row2 2
  1048. Name: col1, dtype: int64,
  1049. 'col2': row1 0.50
  1050. row2 0.75
  1051. Name: col2, dtype: float64}
  1052. >>> df.to_dict('split')
  1053. {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
  1054. 'data': [[1, 0.5], [2, 0.75]]}
  1055. >>> df.to_dict('records')
  1056. [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
  1057. >>> df.to_dict('index')
  1058. {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
  1059. You can also specify the mapping type.
  1060. >>> from collections import OrderedDict, defaultdict
  1061. >>> df.to_dict(into=OrderedDict)
  1062. OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
  1063. ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
  1064. If you want a `defaultdict`, you need to initialize it:
  1065. >>> dd = defaultdict(list)
  1066. >>> df.to_dict('records', into=dd)
  1067. [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
  1068. defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
  1069. """
  1070. if not self.columns.is_unique:
  1071. warnings.warn("DataFrame columns are not unique, some "
  1072. "columns will be omitted.", UserWarning,
  1073. stacklevel=2)
  1074. # GH16122
  1075. into_c = com.standardize_mapping(into)
  1076. if orient.lower().startswith('d'):
  1077. return into_c(
  1078. (k, v.to_dict(into)) for k, v in compat.iteritems(self))
  1079. elif orient.lower().startswith('l'):
  1080. return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
  1081. elif orient.lower().startswith('sp'):
  1082. return into_c((('index', self.index.tolist()),
  1083. ('columns', self.columns.tolist()),
  1084. ('data', [
  1085. list(map(com.maybe_box_datetimelike, t))
  1086. for t in self.itertuples(index=False, name=None)
  1087. ])))
  1088. elif orient.lower().startswith('s'):
  1089. return into_c((k, com.maybe_box_datetimelike(v))
  1090. for k, v in compat.iteritems(self))
  1091. elif orient.lower().startswith('r'):
  1092. columns = self.columns.tolist()
  1093. rows = (dict(zip(columns, row))
  1094. for row in self.itertuples(index=False, name=None))
  1095. return [
  1096. into_c((k, com.maybe_box_datetimelike(v))
  1097. for k, v in compat.iteritems(row))
  1098. for row in rows]
  1099. elif orient.lower().startswith('i'):
  1100. if not self.index.is_unique:
  1101. raise ValueError(
  1102. "DataFrame index must be unique for orient='index'."
  1103. )
  1104. return into_c((t[0], dict(zip(self.columns, t[1:])))
  1105. for t in self.itertuples(name=None))
  1106. else:
  1107. raise ValueError("orient '{o}' not understood".format(o=orient))
  1108. def to_gbq(self, destination_table, project_id=None, chunksize=None,
  1109. reauth=False, if_exists='fail', auth_local_webserver=False,
  1110. table_schema=None, location=None, progress_bar=True,
  1111. credentials=None, verbose=None, private_key=None):
  1112. """
  1113. Write a DataFrame to a Google BigQuery table.
  1114. This function requires the `pandas-gbq package
  1115. <https://pandas-gbq.readthedocs.io>`__.
  1116. See the `How to authenticate with Google BigQuery
  1117. <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
  1118. guide for authentication instructions.
  1119. Parameters
  1120. ----------
  1121. destination_table : str
  1122. Name of table to be written, in the form ``dataset.tablename``.
  1123. project_id : str, optional
  1124. Google BigQuery Account project ID. Optional when available from
  1125. the environment.
  1126. chunksize : int, optional
  1127. Number of rows to be inserted in each chunk from the dataframe.
  1128. Set to ``None`` to load the whole dataframe at once.
  1129. reauth : bool, default False
  1130. Force Google BigQuery to re-authenticate the user. This is useful
  1131. if multiple accounts are used.
  1132. if_exists : str, default 'fail'
  1133. Behavior when the destination table exists. Value can be one of:
  1134. ``'fail'``
  1135. If table exists, do nothing.
  1136. ``'replace'``
  1137. If table exists, drop it, recreate it, and insert data.
  1138. ``'append'``
  1139. If table exists, insert data. Create if does not exist.
  1140. auth_local_webserver : bool, default False
  1141. Use the `local webserver flow`_ instead of the `console flow`_
  1142. when getting user credentials.
  1143. .. _local webserver flow:
  1144. http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
  1145. .. _console flow:
  1146. http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
  1147. *New in version 0.2.0 of pandas-gbq*.
  1148. table_schema : list of dicts, optional
  1149. List of BigQuery table fields to which according DataFrame
  1150. columns conform to, e.g. ``[{'name': 'col1', 'type':
  1151. 'STRING'},...]``. If schema is not provided, it will be
  1152. generated according to dtypes of DataFrame columns. See
  1153. BigQuery API documentation on available names of a field.
  1154. *New in version 0.3.1 of pandas-gbq*.
  1155. location : str, optional
  1156. Location where the load job should run. See the `BigQuery locations
  1157. documentation
  1158. <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
  1159. list of available locations. The location must match that of the
  1160. target dataset.
  1161. *New in version 0.5.0 of pandas-gbq*.
  1162. progress_bar : bool, default True
  1163. Use the library `tqdm` to show the progress bar for the upload,
  1164. chunk by chunk.
  1165. *New in version 0.5.0 of pandas-gbq*.
  1166. credentials : google.auth.credentials.Credentials, optional
  1167. Credentials for a

Large files files are truncated, but you can click here to view the full file