
/pandas/core/nanops.py

http://github.com/wesm/pandas
Python | 1272 lines | 876 code | 118 blank | 278 comment
Possible License(s): BSD-3-Clause, Apache-2.0
from distutils.version import LooseVersion
import functools
import itertools
import operator
import warnings

import numpy as np

from pandas._libs import iNaT, lib, tslibs
import pandas.compat as compat

from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
from pandas.core.dtypes.common import (
    _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype,
    is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype,
    is_float, is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype,
    is_object_dtype, is_scalar, is_timedelta64_dtype, pandas_dtype)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna

import pandas.core.common as com
from pandas.core.config import get_option

_BOTTLENECK_INSTALLED = False
_MIN_BOTTLENECK_VERSION = '1.0.0'

try:
    import bottleneck as bn
    ver = bn.__version__
    _BOTTLENECK_INSTALLED = (LooseVersion(ver) >=
                             LooseVersion(_MIN_BOTTLENECK_VERSION))

    if not _BOTTLENECK_INSTALLED:
        warnings.warn(
            "The installed version of bottleneck {ver} is not supported "
            "in pandas and will not be used\nThe minimum supported "
            "version is {min_ver}\n".format(
                ver=ver, min_ver=_MIN_BOTTLENECK_VERSION), UserWarning)

except ImportError:  # pragma: no cover
    pass


_USE_BOTTLENECK = False


def set_use_bottleneck(v=True):
    # set/unset to use bottleneck
    global _USE_BOTTLENECK
    if _BOTTLENECK_INSTALLED:
        _USE_BOTTLENECK = v


set_use_bottleneck(get_option('compute.use_bottleneck'))
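
# Rough usage sketch (illustrative only): the ``compute.use_bottleneck``
# option is expected to toggle this switch through its option callback, and
# the helper can also be called directly. Assumes bottleneck may be absent,
# in which case ``set_use_bottleneck`` is a no-op.
#
# >>> import pandas as pd
# >>> pd.set_option('compute.use_bottleneck', False)  # disable bottleneck
# >>> set_use_bottleneck(True)  # re-enable manually if it is installed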


class disallow(object):

    def __init__(self, *dtypes):
        super(disallow, self).__init__()
        self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)

    def check(self, obj):
        return hasattr(obj, 'dtype') and issubclass(obj.dtype.type,
                                                    self.dtypes)

    def __call__(self, f):
        @functools.wraps(f)
        def _f(*args, **kwargs):
            obj_iter = itertools.chain(args, compat.itervalues(kwargs))
            if any(self.check(obj) for obj in obj_iter):
                msg = 'reduction operation {name!r} not allowed for this dtype'
                raise TypeError(msg.format(name=f.__name__.replace('nan', '')))
            try:
                with np.errstate(invalid='ignore'):
                    return f(*args, **kwargs)
            except ValueError as e:
                # we want to transform an object array
                # ValueError message to the more typical TypeError
                # e.g. this is normally a disallowed function on
                # object arrays that contain strings
                if is_object_dtype(args[0]):
                    raise TypeError(e)
                raise

        return _f
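
# Illustrative sketch of applying the decorator above; ``nanfoo`` is a
# hypothetical reducer used only for this example and the error text shown
# is approximate.
#
# >>> @disallow('M8')
# ... def nanfoo(values, axis=None, skipna=True):
# ...     return values.sum(axis)
# >>> nanfoo(np.array(['2019-01-01'], dtype='M8[ns]'))
# Traceback (most recent call last):
# ...
# TypeError: reduction operation 'foo' not allowed for this dtype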


class bottleneck_switch(object):

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def __call__(self, alt):
        bn_name = alt.__name__

        try:
            bn_func = getattr(bn, bn_name)
        except (AttributeError, NameError):  # pragma: no cover
            bn_func = None

        @functools.wraps(alt)
        def f(values, axis=None, skipna=True, **kwds):
            if len(self.kwargs) > 0:
                for k, v in compat.iteritems(self.kwargs):
                    if k not in kwds:
                        kwds[k] = v
            try:
                if values.size == 0 and kwds.get('min_count') is None:
                    # We are empty, returning NA for our type
                    # Only applies for the default `min_count` of None
                    # since that affects how empty arrays are handled.
                    # TODO(GH-18976) update all the nanops methods to
                    # correctly handle empty inputs and remove this check.
                    # It *may* just be `var`
                    return _na_for_min_count(values, axis)

                if (_USE_BOTTLENECK and skipna and
                        _bn_ok_dtype(values.dtype, bn_name)):
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            except Exception:
                try:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
                except ValueError as e:
                    # we want to transform an object array
                    # ValueError message to the more typical TypeError
                    # e.g. this is normally a disallowed function on
                    # object arrays that contain strings
                    if is_object_dtype(values):
                        raise TypeError(e)
                    raise

            return result

        return f
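
# Illustrative sketch: the decorator looks up ``bn.<alt.__name__>`` and uses
# it for skipna reductions on supported dtypes, falling back to ``alt``
# otherwise. Decorator kwargs (e.g. ``ddof=1`` on nanstd/nanvar below) become
# defaults injected into ``kwds`` when the caller does not pass them.
# ``nanfoo`` here is hypothetical; since ``bn.nanfoo`` does not exist, the
# alternative implementation ends up running.
#
# >>> @bottleneck_switch(ddof=1)
# ... def nanfoo(values, axis=None, skipna=True, ddof=1):
# ...     return values.std(axis=axis, ddof=ddof)
# >>> nanfoo(np.array([1.0, 2.0, 3.0]))
# 1.0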


def _bn_ok_dtype(dt, name):
    # Bottleneck chokes on datetime64
    if (not is_object_dtype(dt) and
            not (is_datetime_or_timedelta_dtype(dt) or
                 is_datetime64tz_dtype(dt))):

        # GH 15507
        # bottleneck does not properly upcast during the sum
        # so can overflow

        # GH 9422
        # further we also want to preserve NaN when all elements
        # are NaN, unlike bottleneck/numpy which consider this
        # to be 0
        if name in ['nansum', 'nanprod']:
            return False

        return True
    return False


def _has_infs(result):
    if isinstance(result, np.ndarray):
        if result.dtype == 'f8':
            return lib.has_infs_f8(result.ravel())
        elif result.dtype == 'f4':
            return lib.has_infs_f4(result.ravel())
    try:
        return np.isinf(result).any()
    except (TypeError, NotImplementedError):
        # if it doesn't support infs, then it can't have infs
        return False


def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
    """ return the correct fill value for the dtype of the values """
    if fill_value is not None:
        return fill_value
    if _na_ok_dtype(dtype):
        if fill_value_typ is None:
            return np.nan
        else:
            if fill_value_typ == '+inf':
                return np.inf
            else:
                return -np.inf
    else:
        if fill_value_typ is None:
            return tslibs.iNaT
        else:
            if fill_value_typ == '+inf':
                # need the max int here
                return _int64_max
            else:
                return tslibs.iNaT
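
# Illustrative sketch of the selection above; the comments show which branch
# is taken, not doctest-verified output.
#
# >>> _get_fill_value(np.dtype('float64'))                         # np.nan
# >>> _get_fill_value(np.dtype('float64'), fill_value_typ='+inf')  # np.inf
# >>> _get_fill_value(np.dtype('int64'))                           # tslibs.iNaT
# >>> _get_fill_value(np.dtype('M8[ns]'), fill_value_typ='+inf')   # _int64_max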


def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True, mask=None):
    """ utility to get the values view, mask, dtype
    if necessary copy and mask using the specified fill_value
    copy = True will force the copy
    """

    if is_datetime64tz_dtype(values):
        # com.values_from_object returns M8[ns] dtype instead of tz-aware,
        # so this case must be handled separately from the rest
        dtype = values.dtype
        values = getattr(values, "_values", values)
    else:
        values = com.values_from_object(values)
        dtype = values.dtype

    if mask is None:
        if isfinite:
            mask = _isfinite(values)
        else:
            mask = isna(values)

    if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
        # changing timedelta64/datetime64 to int64 needs to happen after
        # finding `mask` above
        values = getattr(values, "asi8", values)
        values = values.view(np.int64)

    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if skipna:
        if copy:
            values = values.copy()
        if dtype_ok:
            np.putmask(values, mask, fill_value)

        # promote if needed
        else:
            values, changed = maybe_upcast_putmask(values, mask, fill_value)

    elif copy:
        values = values.copy()

    # return a platform independent precision dtype
    dtype_max = dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64

    return values, mask, dtype, dtype_max, fill_value


def _isfinite(values):
    if is_datetime_or_timedelta_dtype(values):
        return isna(values)
    if (is_complex_dtype(values) or is_float_dtype(values) or
            is_integer_dtype(values) or is_bool_dtype(values)):
        return ~np.isfinite(values)
    return ~np.isfinite(values.astype('float64'))


def _na_ok_dtype(dtype):
    # TODO: what about datetime64tz? PeriodDtype?
    return not issubclass(dtype.type,
                          (np.integer, np.timedelta64, np.datetime64))


def _wrap_results(result, dtype, fill_value=None):
    """ wrap our results if needed """

    if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        if fill_value is None:
            # GH#24293
            fill_value = iNaT
        if not isinstance(result, np.ndarray):
            tz = getattr(dtype, 'tz', None)
            assert not isna(fill_value), "Expected non-null fill_value"
            if result == fill_value:
                result = np.nan

            result = tslibs.Timestamp(result, tz=tz)
        else:
            result = result.view(dtype)
    elif is_timedelta64_dtype(dtype):
        if not isinstance(result, np.ndarray):
            if result == fill_value:
                result = np.nan

            # raise if we have a timedelta64[ns] which is too large
            if np.fabs(result) > _int64_max:
                raise ValueError("overflow in timedelta operation")

            result = tslibs.Timedelta(result, unit='ns')
        else:
            result = result.astype('i8').view(dtype)

    return result


def _na_for_min_count(values, axis):
    """Return the missing value for `values`

    Parameters
    ----------
    values : ndarray
    axis : int or None
        axis for the reduction

    Returns
    -------
    result : scalar or ndarray
        For 1-D values, returns a scalar of the correct missing type.
        For 2-D values, returns a 1-D array where each element is missing.
    """
    # we either return np.nan or pd.NaT
    if is_numeric_dtype(values):
        values = values.astype('float64')
    fill_value = na_value_for_dtype(values.dtype)

    if values.ndim == 1:
        return fill_value
    else:
        result_shape = (values.shape[:axis] +
                        values.shape[axis + 1:])
        result = np.empty(result_shape, dtype=values.dtype)
        result.fill(fill_value)
        return result


def nanany(values, axis=None, skipna=True, mask=None):
    """
    Check if any elements along an axis evaluate to True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2])
    >>> nanops.nanany(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([np.nan])
    >>> nanops.nanany(s)
    False
    """
    values, mask, dtype, _, _ = _get_values(values, skipna, False, copy=skipna,
                                            mask=mask)
    return values.any(axis)


def nanall(values, axis=None, skipna=True, mask=None):
    """
    Check if all elements along an axis evaluate to True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanall(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 0])
    >>> nanops.nanall(s)
    False
    """
    values, mask, dtype, _, _ = _get_values(values, skipna, True, copy=skipna,
                                            mask=mask)
    return values.all(axis)


@disallow('M8')
def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Sum the elements along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nansum(s)
    3.0
    """
    values, mask, dtype, dtype_max, _ = _get_values(values,
                                                    skipna, 0, mask=mask)
    dtype_sum = dtype_max
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count)

    return _wrap_results(the_sum, dtype)
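
# Rough ``min_count`` sketch (illustrative): with fewer valid values than
# ``min_count``, the sum is nulled out by _maybe_null_out further below.
#
# >>> import pandas.core.nanops as nanops
# >>> s = pd.Series([1.0, 2.0, np.nan])
# >>> nanops.nansum(s, min_count=3)  # only two valid values -> NaN
# nan
# >>> nanops.nansum(s, min_count=2)
# 3.0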


@disallow('M8', DatetimeTZDtype)
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True, mask=None):
    """
    Compute the mean of the elements along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanmean(s)
    1.5
    """
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, 0, mask=mask)
    dtype_sum = dtype_max
    dtype_count = np.float64
    if (is_integer_dtype(dtype) or is_timedelta64_dtype(dtype) or
            is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)):
        dtype_sum = np.float64
    elif is_float_dtype(dtype):
        dtype_sum = dtype
        dtype_count = dtype
    count = _get_counts(mask, axis, dtype=dtype_count)
    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

    if axis is not None and getattr(the_sum, 'ndim', False):
        with np.errstate(all="ignore"):
            # suppress division by zero warnings
            the_mean = the_sum / count
        ct_mask = count == 0
        if ct_mask.any():
            the_mean[ct_mask] = np.nan
    else:
        the_mean = the_sum / count if count > 0 else np.nan

    return _wrap_results(the_mean, dtype)


@disallow('M8')
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 2])
    >>> nanops.nanmedian(s)
    2.0
    """
    def get_median(x):
        mask = notna(x)
        if not skipna and not mask.all():
            return np.nan
        return np.nanmedian(x[mask])

    values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)
    if not is_float_dtype(values):
        values = values.astype('f8')
        values[mask] = np.nan

    if axis is None:
        values = values.ravel()

    notempty = values.size

    # an array from a frame
    if values.ndim > 1:

        # there's a non-empty array to apply over otherwise numpy raises
        if notempty:
            if not skipna:
                return _wrap_results(
                    np.apply_along_axis(get_median, axis, values), dtype)

            # fastpath for the skipna case
            return _wrap_results(np.nanmedian(values, axis), dtype)

        # must return the correct shape, but median is not defined for the
        # empty set so return nans of shape "everything but the passed axis"
        # since "axis" is where the reduction would occur if we had a nonempty
        # array
        shp = np.array(values.shape)
        dims = np.arange(values.ndim)
        ret = np.empty(shp[dims != axis])
        ret.fill(np.nan)
        return _wrap_results(ret, dtype)

    # otherwise return a scalar value
    return _wrap_results(get_median(values) if notempty else np.nan, dtype)


def _get_counts_nanvar(mask, axis, ddof, dtype=float):
    dtype = _get_dtype(dtype)
    count = _get_counts(mask, axis, dtype=dtype)
    d = count - dtype.type(ddof)

    # always return NaN, never inf
    if is_scalar(count):
        if count <= ddof:
            count = np.nan
            d = np.nan
    else:
        mask2 = count <= ddof
        if mask2.any():
            np.putmask(d, mask2, np.nan)
            np.putmask(count, mask2, np.nan)
    return count, d


@disallow('M8')
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the standard deviation along a given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanstd(s)
    1.0
    """
    result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof,
                            mask=mask))
    return _wrap_results(result, values.dtype)


@disallow('M8')
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the variance along a given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanvar(s)
    1.0
    """
    values = com.values_from_object(values)
    dtype = values.dtype
    if mask is None:
        mask = isna(values)
    if is_any_int_dtype(values):
        values = values.astype('f8')
        values[mask] = np.nan

    if is_float_dtype(values):
        count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
    else:
        count, d = _get_counts_nanvar(mask, axis, ddof)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
    if axis is not None:
        avg = np.expand_dims(avg, axis)

    sqr = _ensure_numeric((avg - values) ** 2)
    np.putmask(sqr, mask, 0)
    result = sqr.sum(axis=axis, dtype=np.float64) / d

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if is_float_dtype(dtype):
        result = result.astype(dtype)
    return _wrap_results(result, values.dtype)
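
# Sketch of the two-pass computation above on a plain, NaN-free float array
# (names are local to this illustration):
#
# >>> x = np.array([1.0, 2.0, 3.0])
# >>> avg = x.sum() / len(x)                 # first pass: the mean
# >>> ((x - avg) ** 2).sum() / (len(x) - 1)  # second pass: squared deviations
# 1.0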


@disallow('M8', 'm8')
def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the standard error of the mean along a given axis
    while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nansem(s)
    0.5773502691896258
    """
    # This checks if non-numeric-like data is passed with numeric_only=False
    # and raises a TypeError otherwise
    nanvar(values, axis, skipna, ddof=ddof, mask=mask)

    if mask is None:
        mask = isna(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
    count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
    var = nanvar(values, axis, skipna, ddof=ddof)

    return np.sqrt(var) / np.sqrt(count)


def _nanminmax(meth, fill_value_typ):
    @bottleneck_switch()
    def reduction(values, axis=None, skipna=True, mask=None):
        values, mask, dtype, dtype_max, fill_value = _get_values(
            values, skipna, fill_value_typ=fill_value_typ, mask=mask)

        if ((axis is not None and values.shape[axis] == 0) or
                values.size == 0):
            try:
                result = getattr(values, meth)(axis, dtype=dtype_max)
                result.fill(np.nan)
            except (AttributeError, TypeError,
                    ValueError, np.core._internal.AxisError):
                result = np.nan
        else:
            result = getattr(values, meth)(axis)

        result = _wrap_results(result, dtype, fill_value)
        return _maybe_null_out(result, axis, mask)

    reduction.__name__ = 'nan' + meth
    return reduction


nanmin = _nanminmax('min', fill_value_typ='+inf')
nanmax = _nanminmax('max', fill_value_typ='-inf')
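
# Rough usage sketch (illustrative): NaNs are pre-filled with +inf for a min
# and -inf for a max, so they can never win the reduction.
#
# >>> import pandas.core.nanops as nanops
# >>> s = pd.Series([3.0, np.nan, 1.0])
# >>> nanops.nanmin(s), nanops.nanmax(s)
# (1.0, 3.0)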


@disallow('O')
def nanargmax(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int
        The index of max value in specified axis or -1 in the NA case

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmax(s)
    4
    """
    values, mask, dtype, _, _ = _get_values(
        values, skipna, fill_value_typ='-inf', mask=mask)
    result = values.argmax(axis)
    result = _maybe_arg_null_out(result, axis, mask, skipna)
    return result


@disallow('O')
def nanargmin(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int
        The index of min value in specified axis or -1 in the NA case

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmin(s)
    0
    """
    values, mask, dtype, _, _ = _get_values(
        values, skipna, fill_value_typ='+inf', mask=mask)
    result = values.argmin(axis)
    result = _maybe_arg_null_out(result, axis, mask, skipna)
    return result
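
# Illustrative note: with skipna=True an all-NA input yields -1 rather than
# raising, via _maybe_arg_null_out defined later in this module.
#
# >>> import pandas.core.nanops as nanops
# >>> nanops.nanargmin(pd.Series([np.nan, np.nan]))
# -1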


@disallow('M8', 'm8')
def nanskew(values, axis=None, skipna=True, mask=None):
    """ Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 1, 2])
    >>> nanops.nanskew(s)
    1.7320508075688787
    """
    values = com.values_from_object(values)
    if mask is None:
        mask = isna(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
        count = _get_counts(mask, axis)
    else:
        count = _get_counts(mask, axis, dtype=values.dtype)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted3 = adjusted2 * adjusted
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m3 = adjusted3.sum(axis, dtype=np.float64)

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_skew follows this behavior
    # to fix the fperr to treat m2 <1e-14 as zero
    m2 = _zero_out_fperr(m2)
    m3 = _zero_out_fperr(m3)

    with np.errstate(invalid='ignore', divide='ignore'):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result
    else:
        result = 0 if m2 == 0 else result
        if count < 3:
            return np.nan
        return result
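
# Derivation note (for reference): with n the valid count and M2, M3 the
# second and third central moments, the sums above are m2 = n * M2 and
# m3 = n * M3, so
#
#     count * (count - 1) ** 0.5 / (count - 2) * (m3 / m2 ** 1.5)
#         = sqrt(n * (n - 1)) / (n - 2) * M3 / M2 ** 1.5
#
# which is the adjusted Fisher-Pearson coefficient G1 named in the docstring.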


@disallow('M8', 'm8')
def nankurt(values, axis=None, skipna=True, mask=None):
    """
    Compute the sample excess kurtosis

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 1, 3, 2])
    >>> nanops.nankurt(s)
    -1.2892561983471076
    """
    values = com.values_from_object(values)
    if mask is None:
        mask = isna(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
        count = _get_counts(mask, axis)
    else:
        count = _get_counts(mask, axis, dtype=values.dtype)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted4 = adjusted2 ** 2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    with np.errstate(invalid='ignore', divide='ignore'):
        adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
        numer = count * (count + 1) * (count - 1) * m4
        denom = (count - 2) * (count - 3) * m2 ** 2

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_kurt follows this behavior
    # to fix the fperr to treat denom <1e-14 as zero
    numer = _zero_out_fperr(numer)
    denom = _zero_out_fperr(denom)

    if not isinstance(denom, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denom == 0:
            return 0

    with np.errstate(invalid='ignore', divide='ignore'):
        result = numer / denom - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(denom == 0, 0, result)
        result[count < 4] = np.nan

    return result
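
# Reference note: written in terms of the sums m2 and m4 used above and n the
# valid count, the returned quantity is the standard adjusted excess kurtosis
#
#     G2 = n * (n + 1) * (n - 1) * m4 / ((n - 2) * (n - 3) * m2 ** 2)
#              - 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))
#
# i.e. ``numer / denom - adj`` as computed above.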


@disallow('M8', 'm8')
def nanprod(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype
        The product of all elements on a given axis (NaNs are treated as 1).

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, 3, np.nan])
    >>> nanops.nanprod(s)
    6.0
    """
    if mask is None:
        mask = isna(values)
    if skipna and not is_any_int_dtype(values):
        values = values.copy()
        values[mask] = 1
    result = values.prod(axis)
    return _maybe_null_out(result, axis, mask, min_count=min_count)


def _maybe_arg_null_out(result, axis, mask, skipna):
    # helper function for nanargmin/nanargmax
    if axis is None or not getattr(result, 'ndim', False):
        if skipna:
            if mask.all():
                result = -1
        else:
            if mask.any():
                result = -1
    else:
        if skipna:
            na_mask = mask.all(axis)
        else:
            na_mask = mask.any(axis)
        if na_mask.any():
            result[na_mask] = -1
    return result


def _get_counts(mask, axis, dtype=float):
    dtype = _get_dtype(dtype)
    if axis is None:
        return dtype.type(mask.size - mask.sum())

    count = mask.shape[axis] - mask.sum(axis)
    if is_scalar(count):
        return dtype.type(count)
    try:
        return count.astype(dtype)
    except AttributeError:
        return np.array(count, dtype=dtype)


def _maybe_null_out(result, axis, mask, min_count=1):
    if axis is not None and getattr(result, 'ndim', False):
        null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
        if np.any(null_mask):
            if is_numeric_dtype(result):
                if np.iscomplexobj(result):
                    result = result.astype('c16')
                else:
                    result = result.astype('f8')
                result[null_mask] = np.nan
            else:
                # GH12941, use None to auto cast null
                result[null_mask] = None
    elif result is not tslibs.NaT:
        null_mask = mask.size - mask.sum()
        if null_mask < min_count:
            result = np.nan

    return result


def _zero_out_fperr(arg):
    # #18044 reference this behavior to fix rolling skew/kurt issue
    if isinstance(arg, np.ndarray):
        with np.errstate(invalid='ignore'):
            return np.where(np.abs(arg) < 1e-14, 0, arg)
    else:
        return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg


@disallow('M8', 'm8')
def nancorr(a, b, method='pearson', min_periods=None):
    """
    a, b: ndarrays
    """
    if len(a) != len(b):
        raise AssertionError('Operands to nancorr must have same size')

    if min_periods is None:
        min_periods = 1

    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    f = get_corr_func(method)
    return f(a, b)


def get_corr_func(method):
    if method in ['kendall', 'spearman']:
        from scipy.stats import kendalltau, spearmanr
    elif callable(method):
        return method

    def _pearson(a, b):
        return np.corrcoef(a, b)[0, 1]

    def _kendall(a, b):
        rs = kendalltau(a, b)
        if isinstance(rs, tuple):
            return rs[0]
        return rs

    def _spearman(a, b):
        return spearmanr(a, b)[0]

    _cor_methods = {
        'pearson': _pearson,
        'kendall': _kendall,
        'spearman': _spearman
    }
    return _cor_methods[method]
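
# Rough usage sketch (illustrative; output value approximate): ``method`` may
# also be any callable taking two aligned 1-D arrays and returning a scalar.
#
# >>> x = np.array([1.0, 2.0, np.nan, 4.0])
# >>> y = np.array([1.0, 2.0, 3.0, 4.0])
# >>> nancorr(x, y, method=lambda a, b: np.corrcoef(a, b)[0, 1])
# 1.0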


@disallow('M8', 'm8')
def nancov(a, b, min_periods=None):
    if len(a) != len(b):
        raise AssertionError('Operands to nancov must have same size')

    if min_periods is None:
        min_periods = 1

    valid = notna(a) & notna(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    return np.cov(a, b)[0, 1]


def _ensure_numeric(x):
    if isinstance(x, np.ndarray):
        if is_integer_dtype(x) or is_bool_dtype(x):
            x = x.astype(np.float64)
        elif is_object_dtype(x):
            try:
                x = x.astype(np.complex128)
            except (TypeError, ValueError):
                x = x.astype(np.float64)
            else:
                if not np.any(x.imag):
                    x = x.real
    elif not (is_float(x) or is_integer(x) or is_complex(x)):
        try:
            x = float(x)
        except Exception:
            try:
                x = complex(x)
            except Exception:
                raise TypeError('Could not convert {value!s} to numeric'
                                .format(value=x))
    return x


# NA-friendly array comparisons


def make_nancomp(op):
    def f(x, y):
        xmask = isna(x)
        ymask = isna(y)
        mask = xmask | ymask

        with np.errstate(all='ignore'):
            result = op(x, y)

        if mask.any():
            if is_bool_dtype(result):
                result = result.astype('O')
            np.putmask(result, mask, np.nan)

        return result

    return f


nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)
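
# Illustrative sketch: the NA-friendly comparisons propagate NaN where either
# operand is missing, upcasting a boolean result to object (repr approximate).
#
# >>> naneq(np.array([1.0, np.nan]), np.array([1.0, 2.0]))
# array([True, nan], dtype=object)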


def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]
    values = values[~mask]

    if len(values) == 0:
        if lib.is_scalar(q):
            return na_value
        else:
            return np.array([na_value] * len(q),
                            dtype=values.dtype)

    return np.percentile(values, q, interpolation=interpolation)


def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if not lib.is_scalar(mask) and mask.any():
        if ndim == 1:
            return _nanpercentile_1d(values, mask, q, na_value,
                                     interpolation=interpolation)
        else:
            # for nonconsolidatable blocks mask is 1D, but values 2D
            if mask.ndim < values.ndim:
                mask = mask.reshape(values.shape)
            if axis == 0:
                values = values.T
                mask = mask.T
            result = [_nanpercentile_1d(val, m, q, na_value,
                                        interpolation=interpolation)
                      for (val, m) in zip(list(values), list(mask))]
            result = np.array(result, dtype=values.dtype, copy=False).T
            return result
    else:
        return np.percentile(values, q, axis=axis, interpolation=interpolation)
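
# Rough usage sketch (illustrative; output formatting approximate):
#
# >>> vals = np.array([[1.0, np.nan], [3.0, 4.0]])
# >>> mask = np.isnan(vals)
# >>> nanpercentile(vals, q=50, axis=1, na_value=np.nan, mask=mask,
# ...               ndim=2, interpolation='linear')
# array([1. , 3.5])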