PageRenderTime 52ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/core/nanops.py

http://github.com/pydata/pandas
Python | 1829 lines | 1030 code | 204 blank | 595 comment | 222 complexity | 5300961abfb45f46e243c4d0d8977e63 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. from __future__ import annotations
  2. import functools
  3. import itertools
  4. import operator
  5. from typing import (
  6. Any,
  7. cast,
  8. )
  9. import warnings
  10. import numpy as np
  11. from pandas._config import get_option
  12. from pandas._libs import (
  13. NaT,
  14. NaTType,
  15. Timedelta,
  16. iNaT,
  17. lib,
  18. )
  19. from pandas._typing import (
  20. ArrayLike,
  21. Dtype,
  22. DtypeObj,
  23. F,
  24. Scalar,
  25. Shape,
  26. npt,
  27. )
  28. from pandas.compat._optional import import_optional_dependency
  29. from pandas.core.dtypes.common import (
  30. is_any_int_dtype,
  31. is_bool_dtype,
  32. is_complex,
  33. is_datetime64_any_dtype,
  34. is_float,
  35. is_float_dtype,
  36. is_integer,
  37. is_integer_dtype,
  38. is_numeric_dtype,
  39. is_object_dtype,
  40. is_scalar,
  41. is_timedelta64_dtype,
  42. needs_i8_conversion,
  43. pandas_dtype,
  44. )
  45. from pandas.core.dtypes.dtypes import PeriodDtype
  46. from pandas.core.dtypes.missing import (
  47. isna,
  48. na_value_for_dtype,
  49. notna,
  50. )
  51. from pandas.core.construction import extract_array
# Optional accelerator. The import is requested with errors="warn"; the
# `is not None` test below treats a None result as "bottleneck unavailable".
bn = import_optional_dependency("bottleneck", errors="warn")
_BOTTLENECK_INSTALLED = bn is not None
# Module-level switch, initially off; set_use_bottleneck() is the setter.
_USE_BOTTLENECK = False
  55. def set_use_bottleneck(v: bool = True) -> None:
  56. # set/unset to use bottleneck
  57. global _USE_BOTTLENECK
  58. if _BOTTLENECK_INSTALLED:
  59. _USE_BOTTLENECK = v
  60. set_use_bottleneck(get_option("compute.use_bottleneck"))
  61. class disallow:
  62. def __init__(self, *dtypes: Dtype):
  63. super().__init__()
  64. self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)
  65. def check(self, obj) -> bool:
  66. return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)
  67. def __call__(self, f: F) -> F:
  68. @functools.wraps(f)
  69. def _f(*args, **kwargs):
  70. obj_iter = itertools.chain(args, kwargs.values())
  71. if any(self.check(obj) for obj in obj_iter):
  72. f_name = f.__name__.replace("nan", "")
  73. raise TypeError(
  74. f"reduction operation '{f_name}' not allowed for this dtype"
  75. )
  76. try:
  77. with np.errstate(invalid="ignore"):
  78. return f(*args, **kwargs)
  79. except ValueError as e:
  80. # we want to transform an object array
  81. # ValueError message to the more typical TypeError
  82. # e.g. this is normally a disallowed function on
  83. # object arrays that contain strings
  84. if is_object_dtype(args[0]):
  85. raise TypeError(e) from e
  86. raise
  87. return cast(F, _f)
  88. class bottleneck_switch:
  89. def __init__(self, name=None, **kwargs):
  90. self.name = name
  91. self.kwargs = kwargs
  92. def __call__(self, alt: F) -> F:
  93. bn_name = self.name or alt.__name__
  94. try:
  95. bn_func = getattr(bn, bn_name)
  96. except (AttributeError, NameError): # pragma: no cover
  97. bn_func = None
  98. @functools.wraps(alt)
  99. def f(
  100. values: np.ndarray,
  101. *,
  102. axis: int | None = None,
  103. skipna: bool = True,
  104. **kwds,
  105. ):
  106. if len(self.kwargs) > 0:
  107. for k, v in self.kwargs.items():
  108. if k not in kwds:
  109. kwds[k] = v
  110. if values.size == 0 and kwds.get("min_count") is None:
  111. # We are empty, returning NA for our type
  112. # Only applies for the default `min_count` of None
  113. # since that affects how empty arrays are handled.
  114. # TODO(GH-18976) update all the nanops methods to
  115. # correctly handle empty inputs and remove this check.
  116. # It *may* just be `var`
  117. return _na_for_min_count(values, axis)
  118. if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
  119. if kwds.get("mask", None) is None:
  120. # `mask` is not recognised by bottleneck, would raise
  121. # TypeError if called
  122. kwds.pop("mask", None)
  123. result = bn_func(values, axis=axis, **kwds)
  124. # prefer to treat inf/-inf as NA, but must compute the func
  125. # twice :(
  126. if _has_infs(result):
  127. result = alt(values, axis=axis, skipna=skipna, **kwds)
  128. else:
  129. result = alt(values, axis=axis, skipna=skipna, **kwds)
  130. else:
  131. result = alt(values, axis=axis, skipna=skipna, **kwds)
  132. return result
  133. return cast(F, f)
  134. def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
  135. # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
  136. if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
  137. # GH 15507
  138. # bottleneck does not properly upcast during the sum
  139. # so can overflow
  140. # GH 9422
  141. # further we also want to preserve NaN when all elements
  142. # are NaN, unlike bottleneck/numpy which consider this
  143. # to be 0
  144. return name not in ["nansum", "nanprod"]
  145. return False
  146. def _has_infs(result) -> bool:
  147. if isinstance(result, np.ndarray):
  148. if result.dtype == "f8" or result.dtype == "f4":
  149. # Note: outside of an nanops-specific test, we always have
  150. # result.ndim == 1, so there is no risk of this ravel making a copy.
  151. return lib.has_infs(result.ravel("K"))
  152. try:
  153. return np.isinf(result).any()
  154. except (TypeError, NotImplementedError):
  155. # if it doesn't support infs, then it can't have infs
  156. return False
  157. def _get_fill_value(
  158. dtype: DtypeObj, fill_value: Scalar | None = None, fill_value_typ=None
  159. ):
  160. """return the correct fill value for the dtype of the values"""
  161. if fill_value is not None:
  162. return fill_value
  163. if _na_ok_dtype(dtype):
  164. if fill_value_typ is None:
  165. return np.nan
  166. else:
  167. if fill_value_typ == "+inf":
  168. return np.inf
  169. else:
  170. return -np.inf
  171. else:
  172. if fill_value_typ == "+inf":
  173. # need the max int here
  174. return lib.i8max
  175. else:
  176. return iNaT
  177. def _maybe_get_mask(
  178. values: np.ndarray, skipna: bool, mask: npt.NDArray[np.bool_] | None
  179. ) -> npt.NDArray[np.bool_] | None:
  180. """
  181. Compute a mask if and only if necessary.
  182. This function will compute a mask iff it is necessary. Otherwise,
  183. return the provided mask (potentially None) when a mask does not need to be
  184. computed.
  185. A mask is never necessary if the values array is of boolean or integer
  186. dtypes, as these are incapable of storing NaNs. If passing a NaN-capable
  187. dtype that is interpretable as either boolean or integer data (eg,
  188. timedelta64), a mask must be provided.
  189. If the skipna parameter is False, a new mask will not be computed.
  190. The mask is computed using isna() by default. Setting invert=True selects
  191. notna() as the masking function.
  192. Parameters
  193. ----------
  194. values : ndarray
  195. input array to potentially compute mask for
  196. skipna : bool
  197. boolean for whether NaNs should be skipped
  198. mask : Optional[ndarray]
  199. nan-mask if known
  200. Returns
  201. -------
  202. Optional[np.ndarray[bool]]
  203. """
  204. if mask is None:
  205. if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype):
  206. # Boolean data cannot contain nulls, so signal via mask being None
  207. return None
  208. if skipna or needs_i8_conversion(values.dtype):
  209. mask = isna(values)
  210. return mask
  211. def _get_values(
  212. values: np.ndarray,
  213. skipna: bool,
  214. fill_value: Any = None,
  215. fill_value_typ: str | None = None,
  216. mask: npt.NDArray[np.bool_] | None = None,
  217. ) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None, np.dtype, np.dtype, Any]:
  218. """
  219. Utility to get the values view, mask, dtype, dtype_max, and fill_value.
  220. If both mask and fill_value/fill_value_typ are not None and skipna is True,
  221. the values array will be copied.
  222. For input arrays of boolean or integer dtypes, copies will only occur if a
  223. precomputed mask, a fill_value/fill_value_typ, and skipna=True are
  224. provided.
  225. Parameters
  226. ----------
  227. values : ndarray
  228. input array to potentially compute mask for
  229. skipna : bool
  230. boolean for whether NaNs should be skipped
  231. fill_value : Any
  232. value to fill NaNs with
  233. fill_value_typ : str
  234. Set to '+inf' or '-inf' to handle dtype-specific infinities
  235. mask : Optional[np.ndarray[bool]]
  236. nan-mask if known
  237. Returns
  238. -------
  239. values : ndarray
  240. Potential copy of input value array
  241. mask : Optional[ndarray[bool]]
  242. Mask for values, if deemed necessary to compute
  243. dtype : np.dtype
  244. dtype for values
  245. dtype_max : np.dtype
  246. platform independent dtype
  247. fill_value : Any
  248. fill value used
  249. """
  250. # In _get_values is only called from within nanops, and in all cases
  251. # with scalar fill_value. This guarantee is important for the
  252. # np.where call below
  253. assert is_scalar(fill_value)
  254. # error: Incompatible types in assignment (expression has type "Union[Any,
  255. # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
  256. values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
  257. mask = _maybe_get_mask(values, skipna, mask)
  258. dtype = values.dtype
  259. datetimelike = False
  260. if needs_i8_conversion(values.dtype):
  261. # changing timedelta64/datetime64 to int64 needs to happen after
  262. # finding `mask` above
  263. values = np.asarray(values.view("i8"))
  264. datetimelike = True
  265. dtype_ok = _na_ok_dtype(dtype)
  266. # get our fill value (in case we need to provide an alternative
  267. # dtype for it)
  268. fill_value = _get_fill_value(
  269. dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
  270. )
  271. if skipna and (mask is not None) and (fill_value is not None):
  272. if mask.any():
  273. if dtype_ok or datetimelike:
  274. values = values.copy()
  275. np.putmask(values, mask, fill_value)
  276. else:
  277. # np.where will promote if needed
  278. values = np.where(~mask, values, fill_value)
  279. # return a platform independent precision dtype
  280. dtype_max = dtype
  281. if is_integer_dtype(dtype) or is_bool_dtype(dtype):
  282. dtype_max = np.dtype(np.int64)
  283. elif is_float_dtype(dtype):
  284. dtype_max = np.dtype(np.float64)
  285. return values, mask, dtype, dtype_max, fill_value
  286. def _na_ok_dtype(dtype: DtypeObj) -> bool:
  287. if needs_i8_conversion(dtype):
  288. return False
  289. return not issubclass(dtype.type, np.integer)
  290. def _wrap_results(result, dtype: np.dtype, fill_value=None):
  291. """wrap our results if needed"""
  292. if result is NaT:
  293. pass
  294. elif is_datetime64_any_dtype(dtype):
  295. if fill_value is None:
  296. # GH#24293
  297. fill_value = iNaT
  298. if not isinstance(result, np.ndarray):
  299. assert not isna(fill_value), "Expected non-null fill_value"
  300. if result == fill_value:
  301. result = np.nan
  302. if isna(result):
  303. result = np.datetime64("NaT", "ns")
  304. else:
  305. result = np.int64(result).view("datetime64[ns]")
  306. else:
  307. # If we have float dtype, taking a view will give the wrong result
  308. result = result.astype(dtype)
  309. elif is_timedelta64_dtype(dtype):
  310. if not isinstance(result, np.ndarray):
  311. if result == fill_value:
  312. result = np.nan
  313. # raise if we have a timedelta64[ns] which is too large
  314. if np.fabs(result) > lib.i8max:
  315. raise ValueError("overflow in timedelta operation")
  316. result = Timedelta(result, unit="ns")
  317. else:
  318. result = result.astype("m8[ns]").view(dtype)
  319. return result
  320. def _datetimelike_compat(func: F) -> F:
  321. """
  322. If we have datetime64 or timedelta64 values, ensure we have a correct
  323. mask before calling the wrapped function, then cast back afterwards.
  324. """
  325. @functools.wraps(func)
  326. def new_func(
  327. values: np.ndarray,
  328. *,
  329. axis: int | None = None,
  330. skipna: bool = True,
  331. mask: npt.NDArray[np.bool_] | None = None,
  332. **kwargs,
  333. ):
  334. orig_values = values
  335. datetimelike = values.dtype.kind in ["m", "M"]
  336. if datetimelike and mask is None:
  337. mask = isna(values)
  338. result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
  339. if datetimelike:
  340. result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)
  341. if not skipna:
  342. assert mask is not None # checked above
  343. result = _mask_datetimelike_result(result, axis, mask, orig_values)
  344. return result
  345. return cast(F, new_func)
  346. def _na_for_min_count(values: np.ndarray, axis: int | None) -> Scalar | np.ndarray:
  347. """
  348. Return the missing value for `values`.
  349. Parameters
  350. ----------
  351. values : ndarray
  352. axis : int or None
  353. axis for the reduction, required if values.ndim > 1.
  354. Returns
  355. -------
  356. result : scalar or ndarray
  357. For 1-D values, returns a scalar of the correct missing type.
  358. For 2-D values, returns a 1-D array where each element is missing.
  359. """
  360. # we either return np.nan or pd.NaT
  361. if is_numeric_dtype(values):
  362. values = values.astype("float64")
  363. fill_value = na_value_for_dtype(values.dtype)
  364. if values.ndim == 1:
  365. return fill_value
  366. elif axis is None:
  367. return fill_value
  368. else:
  369. result_shape = values.shape[:axis] + values.shape[axis + 1 :]
  370. return np.full(result_shape, fill_value, dtype=values.dtype)
  371. def maybe_operate_rowwise(func: F) -> F:
  372. """
  373. NumPy operations on C-contiguous ndarrays with axis=1 can be
  374. very slow. Operate row-by-row and concatenate the results.
  375. """
  376. @functools.wraps(func)
  377. def newfunc(values: np.ndarray, *, axis: int | None = None, **kwargs):
  378. if (
  379. axis == 1
  380. and values.ndim == 2
  381. and values.flags["C_CONTIGUOUS"]
  382. and values.dtype != object
  383. and values.dtype != bool
  384. ):
  385. arrs = list(values)
  386. if kwargs.get("mask") is not None:
  387. mask = kwargs.pop("mask")
  388. results = [
  389. func(arrs[i], mask=mask[i], **kwargs) for i in range(len(arrs))
  390. ]
  391. else:
  392. results = [func(x, **kwargs) for x in arrs]
  393. return np.array(results)
  394. return func(values, axis=axis, **kwargs)
  395. return cast(F, newfunc)
  396. def nanany(
  397. values: np.ndarray,
  398. *,
  399. axis: int | None = None,
  400. skipna: bool = True,
  401. mask: npt.NDArray[np.bool_] | None = None,
  402. ) -> bool:
  403. """
  404. Check if any elements along an axis evaluate to True.
  405. Parameters
  406. ----------
  407. values : ndarray
  408. axis : int, optional
  409. skipna : bool, default True
  410. mask : ndarray[bool], optional
  411. nan-mask if known
  412. Returns
  413. -------
  414. result : bool
  415. Examples
  416. --------
  417. >>> import pandas.core.nanops as nanops
  418. >>> s = pd.Series([1, 2])
  419. >>> nanops.nanany(s)
  420. True
  421. >>> import pandas.core.nanops as nanops
  422. >>> s = pd.Series([np.nan])
  423. >>> nanops.nanany(s)
  424. False
  425. """
  426. values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask)
  427. # For object type, any won't necessarily return
  428. # boolean values (numpy/numpy#4352)
  429. if is_object_dtype(values):
  430. values = values.astype(bool)
  431. # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
  432. # "bool")
  433. return values.any(axis) # type: ignore[return-value]
  434. def nanall(
  435. values: np.ndarray,
  436. *,
  437. axis: int | None = None,
  438. skipna: bool = True,
  439. mask: npt.NDArray[np.bool_] | None = None,
  440. ) -> bool:
  441. """
  442. Check if all elements along an axis evaluate to True.
  443. Parameters
  444. ----------
  445. values : ndarray
  446. axis : int, optional
  447. skipna : bool, default True
  448. mask : ndarray[bool], optional
  449. nan-mask if known
  450. Returns
  451. -------
  452. result : bool
  453. Examples
  454. --------
  455. >>> import pandas.core.nanops as nanops
  456. >>> s = pd.Series([1, 2, np.nan])
  457. >>> nanops.nanall(s)
  458. True
  459. >>> import pandas.core.nanops as nanops
  460. >>> s = pd.Series([1, 0])
  461. >>> nanops.nanall(s)
  462. False
  463. """
  464. values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask)
  465. # For object type, all won't necessarily return
  466. # boolean values (numpy/numpy#4352)
  467. if is_object_dtype(values):
  468. values = values.astype(bool)
  469. # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
  470. # "bool")
  471. return values.all(axis) # type: ignore[return-value]
  472. @disallow("M8")
  473. @_datetimelike_compat
  474. @maybe_operate_rowwise
  475. def nansum(
  476. values: np.ndarray,
  477. *,
  478. axis: int | None = None,
  479. skipna: bool = True,
  480. min_count: int = 0,
  481. mask: npt.NDArray[np.bool_] | None = None,
  482. ) -> float:
  483. """
  484. Sum the elements along an axis ignoring NaNs
  485. Parameters
  486. ----------
  487. values : ndarray[dtype]
  488. axis : int, optional
  489. skipna : bool, default True
  490. min_count: int, default 0
  491. mask : ndarray[bool], optional
  492. nan-mask if known
  493. Returns
  494. -------
  495. result : dtype
  496. Examples
  497. --------
  498. >>> import pandas.core.nanops as nanops
  499. >>> s = pd.Series([1, 2, np.nan])
  500. >>> nanops.nansum(s)
  501. 3.0
  502. """
  503. values, mask, dtype, dtype_max, _ = _get_values(
  504. values, skipna, fill_value=0, mask=mask
  505. )
  506. dtype_sum = dtype_max
  507. if is_float_dtype(dtype):
  508. dtype_sum = dtype
  509. elif is_timedelta64_dtype(dtype):
  510. dtype_sum = np.dtype(np.float64)
  511. the_sum = values.sum(axis, dtype=dtype_sum)
  512. the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
  513. return the_sum
  514. def _mask_datetimelike_result(
  515. result: np.ndarray | np.datetime64 | np.timedelta64,
  516. axis: int | None,
  517. mask: npt.NDArray[np.bool_],
  518. orig_values: np.ndarray,
  519. ) -> np.ndarray | np.datetime64 | np.timedelta64 | NaTType:
  520. if isinstance(result, np.ndarray):
  521. # we need to apply the mask
  522. result = result.astype("i8").view(orig_values.dtype)
  523. axis_mask = mask.any(axis=axis)
  524. # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any],
  525. # datetime64, timedelta64]")
  526. result[axis_mask] = iNaT # type: ignore[index]
  527. else:
  528. if mask.any():
  529. return NaT
  530. return result
  531. @disallow(PeriodDtype)
  532. @bottleneck_switch()
  533. @_datetimelike_compat
  534. def nanmean(
  535. values: np.ndarray,
  536. *,
  537. axis: int | None = None,
  538. skipna: bool = True,
  539. mask: npt.NDArray[np.bool_] | None = None,
  540. ) -> float:
  541. """
  542. Compute the mean of the element along an axis ignoring NaNs
  543. Parameters
  544. ----------
  545. values : ndarray
  546. axis : int, optional
  547. skipna : bool, default True
  548. mask : ndarray[bool], optional
  549. nan-mask if known
  550. Returns
  551. -------
  552. float
  553. Unless input is a float array, in which case use the same
  554. precision as the input array.
  555. Examples
  556. --------
  557. >>> import pandas.core.nanops as nanops
  558. >>> s = pd.Series([1, 2, np.nan])
  559. >>> nanops.nanmean(s)
  560. 1.5
  561. """
  562. values, mask, dtype, dtype_max, _ = _get_values(
  563. values, skipna, fill_value=0, mask=mask
  564. )
  565. dtype_sum = dtype_max
  566. dtype_count = np.dtype(np.float64)
  567. # not using needs_i8_conversion because that includes period
  568. if dtype.kind in ["m", "M"]:
  569. dtype_sum = np.dtype(np.float64)
  570. elif is_integer_dtype(dtype):
  571. dtype_sum = np.dtype(np.float64)
  572. elif is_float_dtype(dtype):
  573. dtype_sum = dtype
  574. dtype_count = dtype
  575. count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
  576. the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
  577. if axis is not None and getattr(the_sum, "ndim", False):
  578. count = cast(np.ndarray, count)
  579. with np.errstate(all="ignore"):
  580. # suppress division by zero warnings
  581. the_mean = the_sum / count
  582. ct_mask = count == 0
  583. if ct_mask.any():
  584. the_mean[ct_mask] = np.nan
  585. else:
  586. the_mean = the_sum / count if count > 0 else np.nan
  587. return the_mean
  588. @bottleneck_switch()
  589. def nanmedian(values, *, axis=None, skipna=True, mask=None):
  590. """
  591. Parameters
  592. ----------
  593. values : ndarray
  594. axis : int, optional
  595. skipna : bool, default True
  596. mask : ndarray[bool], optional
  597. nan-mask if known
  598. Returns
  599. -------
  600. result : float
  601. Unless input is a float array, in which case use the same
  602. precision as the input array.
  603. Examples
  604. --------
  605. >>> import pandas.core.nanops as nanops
  606. >>> s = pd.Series([1, np.nan, 2, 2])
  607. >>> nanops.nanmedian(s)
  608. 2.0
  609. """
  610. def get_median(x):
  611. mask = notna(x)
  612. if not skipna and not mask.all():
  613. return np.nan
  614. with warnings.catch_warnings():
  615. # Suppress RuntimeWarning about All-NaN slice
  616. warnings.filterwarnings("ignore", "All-NaN slice encountered")
  617. res = np.nanmedian(x[mask])
  618. return res
  619. values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask)
  620. if not is_float_dtype(values.dtype):
  621. try:
  622. values = values.astype("f8")
  623. except ValueError as err:
  624. # e.g. "could not convert string to float: 'a'"
  625. raise TypeError(str(err)) from err
  626. if mask is not None:
  627. values[mask] = np.nan
  628. if axis is None:
  629. values = values.ravel("K")
  630. notempty = values.size
  631. # an array from a frame
  632. if values.ndim > 1:
  633. # there's a non-empty array to apply over otherwise numpy raises
  634. if notempty:
  635. if not skipna:
  636. res = np.apply_along_axis(get_median, axis, values)
  637. else:
  638. # fastpath for the skipna case
  639. with warnings.catch_warnings():
  640. # Suppress RuntimeWarning about All-NaN slice
  641. warnings.filterwarnings("ignore", "All-NaN slice encountered")
  642. res = np.nanmedian(values, axis)
  643. else:
  644. # must return the correct shape, but median is not defined for the
  645. # empty set so return nans of shape "everything but the passed axis"
  646. # since "axis" is where the reduction would occur if we had a nonempty
  647. # array
  648. res = get_empty_reduction_result(values.shape, axis, np.float_, np.nan)
  649. else:
  650. # otherwise return a scalar value
  651. res = get_median(values) if notempty else np.nan
  652. return _wrap_results(res, dtype)
  653. def get_empty_reduction_result(
  654. shape: tuple[int, ...],
  655. axis: int,
  656. dtype: np.dtype | type[np.floating],
  657. fill_value: Any,
  658. ) -> np.ndarray:
  659. """
  660. The result from a reduction on an empty ndarray.
  661. Parameters
  662. ----------
  663. shape : Tuple[int]
  664. axis : int
  665. dtype : np.dtype
  666. fill_value : Any
  667. Returns
  668. -------
  669. np.ndarray
  670. """
  671. shp = np.array(shape)
  672. dims = np.arange(len(shape))
  673. ret = np.empty(shp[dims != axis], dtype=dtype)
  674. ret.fill(fill_value)
  675. return ret
  676. def _get_counts_nanvar(
  677. values_shape: Shape,
  678. mask: npt.NDArray[np.bool_] | None,
  679. axis: int | None,
  680. ddof: int,
  681. dtype: np.dtype = np.dtype(np.float64),
  682. ) -> tuple[int | float | np.ndarray, int | float | np.ndarray]:
  683. """
  684. Get the count of non-null values along an axis, accounting
  685. for degrees of freedom.
  686. Parameters
  687. ----------
  688. values_shape : Tuple[int, ...]
  689. shape tuple from values ndarray, used if mask is None
  690. mask : Optional[ndarray[bool]]
  691. locations in values that should be considered missing
  692. axis : Optional[int]
  693. axis to count along
  694. ddof : int
  695. degrees of freedom
  696. dtype : type, optional
  697. type to use for count
  698. Returns
  699. -------
  700. count : int, np.nan or np.ndarray
  701. d : int, np.nan or np.ndarray
  702. """
  703. count = _get_counts(values_shape, mask, axis, dtype=dtype)
  704. d = count - dtype.type(ddof)
  705. # always return NaN, never inf
  706. if is_scalar(count):
  707. if count <= ddof:
  708. count = np.nan
  709. d = np.nan
  710. else:
  711. # count is not narrowed by is_scalar check
  712. count = cast(np.ndarray, count)
  713. mask = count <= ddof
  714. if mask.any():
  715. np.putmask(d, mask, np.nan)
  716. np.putmask(count, mask, np.nan)
  717. return count, d
  718. @bottleneck_switch(ddof=1)
  719. def nanstd(values, *, axis=None, skipna=True, ddof=1, mask=None):
  720. """
  721. Compute the standard deviation along given axis while ignoring NaNs
  722. Parameters
  723. ----------
  724. values : ndarray
  725. axis : int, optional
  726. skipna : bool, default True
  727. ddof : int, default 1
  728. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  729. where N represents the number of elements.
  730. mask : ndarray[bool], optional
  731. nan-mask if known
  732. Returns
  733. -------
  734. result : float
  735. Unless input is a float array, in which case use the same
  736. precision as the input array.
  737. Examples
  738. --------
  739. >>> import pandas.core.nanops as nanops
  740. >>> s = pd.Series([1, np.nan, 2, 3])
  741. >>> nanops.nanstd(s)
  742. 1.0
  743. """
  744. if values.dtype == "M8[ns]":
  745. values = values.view("m8[ns]")
  746. orig_dtype = values.dtype
  747. values, mask, _, _, _ = _get_values(values, skipna, mask=mask)
  748. result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
  749. return _wrap_results(result, orig_dtype)
  750. @disallow("M8", "m8")
  751. @bottleneck_switch(ddof=1)
  752. def nanvar(values, *, axis=None, skipna=True, ddof=1, mask=None):
  753. """
  754. Compute the variance along given axis while ignoring NaNs
  755. Parameters
  756. ----------
  757. values : ndarray
  758. axis : int, optional
  759. skipna : bool, default True
  760. ddof : int, default 1
  761. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  762. where N represents the number of elements.
  763. mask : ndarray[bool], optional
  764. nan-mask if known
  765. Returns
  766. -------
  767. result : float
  768. Unless input is a float array, in which case use the same
  769. precision as the input array.
  770. Examples
  771. --------
  772. >>> import pandas.core.nanops as nanops
  773. >>> s = pd.Series([1, np.nan, 2, 3])
  774. >>> nanops.nanvar(s)
  775. 1.0
  776. """
  777. values = extract_array(values, extract_numpy=True)
  778. dtype = values.dtype
  779. mask = _maybe_get_mask(values, skipna, mask)
  780. if is_any_int_dtype(dtype):
  781. values = values.astype("f8")
  782. if mask is not None:
  783. values[mask] = np.nan
  784. if is_float_dtype(values.dtype):
  785. count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
  786. else:
  787. count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)
  788. if skipna and mask is not None:
  789. values = values.copy()
  790. np.putmask(values, mask, 0)
  791. # xref GH10242
  792. # Compute variance via two-pass algorithm, which is stable against
  793. # cancellation errors and relatively accurate for small numbers of
  794. # observations.
  795. #
  796. # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
  797. avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
  798. if axis is not None:
  799. avg = np.expand_dims(avg, axis)
  800. sqr = _ensure_numeric((avg - values) ** 2)
  801. if mask is not None:
  802. np.putmask(sqr, mask, 0)
  803. result = sqr.sum(axis=axis, dtype=np.float64) / d
  804. # Return variance as np.float64 (the datatype used in the accumulator),
  805. # unless we were dealing with a float array, in which case use the same
  806. # precision as the original values array.
  807. if is_float_dtype(dtype):
  808. result = result.astype(dtype, copy=False)
  809. return result
  810. @disallow("M8", "m8")
  811. def nansem(
  812. values: np.ndarray,
  813. *,
  814. axis: int | None = None,
  815. skipna: bool = True,
  816. ddof: int = 1,
  817. mask: npt.NDArray[np.bool_] | None = None,
  818. ) -> float:
  819. """
  820. Compute the standard error in the mean along given axis while ignoring NaNs
  821. Parameters
  822. ----------
  823. values : ndarray
  824. axis : int, optional
  825. skipna : bool, default True
  826. ddof : int, default 1
  827. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  828. where N represents the number of elements.
  829. mask : ndarray[bool], optional
  830. nan-mask if known
  831. Returns
  832. -------
  833. result : float64
  834. Unless input is a float array, in which case use the same
  835. precision as the input array.
  836. Examples
  837. --------
  838. >>> import pandas.core.nanops as nanops
  839. >>> s = pd.Series([1, np.nan, 2, 3])
  840. >>> nanops.nansem(s)
  841. 0.5773502691896258
  842. """
  843. # This checks if non-numeric-like data is passed with numeric_only=False
  844. # and raises a TypeError otherwise
  845. nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
  846. mask = _maybe_get_mask(values, skipna, mask)
  847. if not is_float_dtype(values.dtype):
  848. values = values.astype("f8")
  849. count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
  850. var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof)
  851. return np.sqrt(var) / np.sqrt(count)
  852. def _nanminmax(meth, fill_value_typ):
  853. @bottleneck_switch(name="nan" + meth)
  854. @_datetimelike_compat
  855. def reduction(
  856. values: np.ndarray,
  857. *,
  858. axis: int | None = None,
  859. skipna: bool = True,
  860. mask: npt.NDArray[np.bool_] | None = None,
  861. ) -> Dtype:
  862. values, mask, dtype, dtype_max, fill_value = _get_values(
  863. values, skipna, fill_value_typ=fill_value_typ, mask=mask
  864. )
  865. if (axis is not None and values.shape[axis] == 0) or values.size == 0:
  866. try:
  867. result = getattr(values, meth)(axis, dtype=dtype_max)
  868. result.fill(np.nan)
  869. except (AttributeError, TypeError, ValueError):
  870. result = np.nan
  871. else:
  872. result = getattr(values, meth)(axis)
  873. result = _maybe_null_out(result, axis, mask, values.shape)
  874. return result
  875. return reduction
  876. nanmin = _nanminmax("min", fill_value_typ="+inf")
  877. nanmax = _nanminmax("max", fill_value_typ="-inf")
  878. @disallow("O")
  879. def nanargmax(
  880. values: np.ndarray,
  881. *,
  882. axis: int | None = None,
  883. skipna: bool = True,
  884. mask: npt.NDArray[np.bool_] | None = None,
  885. ) -> int | np.ndarray:
  886. """
  887. Parameters
  888. ----------
  889. values : ndarray
  890. axis : int, optional
  891. skipna : bool, default True
  892. mask : ndarray[bool], optional
  893. nan-mask if known
  894. Returns
  895. -------
  896. result : int or ndarray[int]
  897. The index/indices of max value in specified axis or -1 in the NA case
  898. Examples
  899. --------
  900. >>> import pandas.core.nanops as nanops
  901. >>> arr = np.array([1, 2, 3, np.nan, 4])
  902. >>> nanops.nanargmax(arr)
  903. 4
  904. >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
  905. >>> arr[2:, 2] = np.nan
  906. >>> arr
  907. array([[ 0., 1., 2.],
  908. [ 3., 4., 5.],
  909. [ 6., 7., nan],
  910. [ 9., 10., nan]])
  911. >>> nanops.nanargmax(arr, axis=1)
  912. array([2, 2, 1, 1])
  913. """
  914. values, mask, _, _, _ = _get_values(values, True, fill_value_typ="-inf", mask=mask)
  915. # error: Need type annotation for 'result'
  916. result = values.argmax(axis) # type: ignore[var-annotated]
  917. result = _maybe_arg_null_out(result, axis, mask, skipna)
  918. return result
  919. @disallow("O")
  920. def nanargmin(
  921. values: np.ndarray,
  922. *,
  923. axis: int | None = None,
  924. skipna: bool = True,
  925. mask: npt.NDArray[np.bool_] | None = None,
  926. ) -> int | np.ndarray:
  927. """
  928. Parameters
  929. ----------
  930. values : ndarray
  931. axis : int, optional
  932. skipna : bool, default True
  933. mask : ndarray[bool], optional
  934. nan-mask if known
  935. Returns
  936. -------
  937. result : int or ndarray[int]
  938. The index/indices of min value in specified axis or -1 in the NA case
  939. Examples
  940. --------
  941. >>> import pandas.core.nanops as nanops
  942. >>> arr = np.array([1, 2, 3, np.nan, 4])
  943. >>> nanops.nanargmin(arr)
  944. 0
  945. >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
  946. >>> arr[2:, 0] = np.nan
  947. >>> arr
  948. array([[ 0., 1., 2.],
  949. [ 3., 4., 5.],
  950. [nan, 7., 8.],
  951. [nan, 10., 11.]])
  952. >>> nanops.nanargmin(arr, axis=1)
  953. array([0, 0, 1, 1])
  954. """
  955. values, mask, _, _, _ = _get_values(values, True, fill_value_typ="+inf", mask=mask)
  956. # error: Need type annotation for 'result'
  957. result = values.argmin(axis) # type: ignore[var-annotated]
  958. result = _maybe_arg_null_out(result, axis, mask, skipna)
  959. return result
  960. @disallow("M8", "m8")
  961. @maybe_operate_rowwise
  962. def nanskew(
  963. values: np.ndarray,
  964. *,
  965. axis: int | None = None,
  966. skipna: bool = True,
  967. mask: npt.NDArray[np.bool_] | None = None,
  968. ) -> float:
  969. """
  970. Compute the sample skewness.
  971. The statistic computed here is the adjusted Fisher-Pearson standardized
  972. moment coefficient G1. The algorithm computes this coefficient directly
  973. from the second and third central moment.
  974. Parameters
  975. ----------
  976. values : ndarray
  977. axis : int, optional
  978. skipna : bool, default True
  979. mask : ndarray[bool], optional
  980. nan-mask if known
  981. Returns
  982. -------
  983. result : float64
  984. Unless input is a float array, in which case use the same
  985. precision as the input array.
  986. Examples
  987. --------
  988. >>> import pandas.core.nanops as nanops
  989. >>> s = pd.Series([1, np.nan, 1, 2])
  990. >>> nanops.nanskew(s)
  991. 1.7320508075688787
  992. """
  993. # error: Incompatible types in assignment (expression has type "Union[Any,
  994. # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
  995. values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
  996. mask = _maybe_get_mask(values, skipna, mask)
  997. if not is_float_dtype(values.dtype):
  998. values = values.astype("f8")
  999. count = _get_counts(values.shape, mask, axis)
  1000. else:
  1001. count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
  1002. if skipna and mask is not None:
  1003. values = values.copy()
  1004. np.putmask(values, mask, 0)
  1005. mean = values.sum(axis, dtype=np.float64) / count
  1006. if axis is not None:
  1007. mean = np.expand_dims(mean, axis)
  1008. adjusted = values - mean
  1009. if skipna and mask is not None:
  1010. np.putmask(adjusted, mask, 0)
  1011. adjusted2 = adjusted ** 2
  1012. adjusted3 = adjusted2 * adjusted
  1013. m2 = adjusted2.sum(axis, dtype=np.float64)
  1014. m3 = adjusted3.sum(axis, dtype=np.float64)
  1015. # floating point error
  1016. #
  1017. # #18044 in _libs/windows.pyx calc_skew follow this behavior
  1018. # to fix the fperr to treat m2 <1e-14 as zero
  1019. m2 = _zero_out_fperr(m2)
  1020. m3 = _zero_out_fperr(m3)
  1021. with np.errstate(invalid="ignore", divide="ignore"):
  1022. result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)
  1023. dtype = values.dtype
  1024. if is_float_dtype(dtype):
  1025. result = result.astype(dtype, copy=False)
  1026. if isinstance(result, np.ndarray):
  1027. result = np.where(m2 == 0, 0, result)
  1028. result[count < 3] = np.nan
  1029. else:
  1030. result = 0 if m2 == 0 else result
  1031. if count < 3:
  1032. return np.nan
  1033. return result
  1034. @disallow("M8", "m8")
  1035. @maybe_operate_rowwise
  1036. def nankurt(
  1037. values: np.ndarray,
  1038. *,
  1039. axis: int | None = None,
  1040. skipna: bool = True,
  1041. mask: npt.NDArray[np.bool_] | None = None,
  1042. ) -> float:
  1043. """
  1044. Compute the sample excess kurtosis
  1045. The statistic computed here is the adjusted Fisher-Pearson standardized
  1046. moment coefficient G2, computed directly from the second and fourth
  1047. central moment.
  1048. Parameters
  1049. ----------
  1050. values : ndarray
  1051. axis : int, optional
  1052. skipna : bool, default True
  1053. mask : ndarray[bool], optional
  1054. nan-mask if known
  1055. Returns
  1056. -------
  1057. result : float64
  1058. Unless input is a float array, in which case use the same
  1059. precision as the input array.
  1060. Examples
  1061. --------
  1062. >>> import pandas.core.nanops as nanops
  1063. >>> s = pd.Series([1, np.nan, 1, 3, 2])
  1064. >>> nanops.nankurt(s)
  1065. -1.2892561983471076
  1066. """
  1067. # error: Incompatible types in assignment (expression has type "Union[Any,
  1068. # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
  1069. values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
  1070. mask = _maybe_get_mask(values, skipna, mask)
  1071. if not is_float_dtype(values.dtype):
  1072. values = values.astype("f8")
  1073. count = _get_counts(values.shape, mask, axis)
  1074. else:
  1075. count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
  1076. if skipna and mask is not None:
  1077. values = values.copy()
  1078. np.putmask(values, mask, 0)
  1079. mean = values.sum(axis, dtype=np.float64) / count
  1080. if axis is not None:
  1081. mean = np.expand_dims(mean, axis)
  1082. adjusted = values - mean
  1083. if skipna and mask is not None:
  1084. np.putmask(adjusted, mask, 0)
  1085. adjusted2 = adjusted ** 2
  1086. adjusted4 = adjusted2 ** 2
  1087. m2 = adjusted2.sum(axis, dtype=np.float64)
  1088. m4 = adjusted4.sum(axis, dtype=np.float64)
  1089. with np.errstate(invalid="ignore", divide="ignore"):
  1090. adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
  1091. numerator = count * (count + 1) * (count - 1) * m4
  1092. denominator = (count - 2) * (count - 3) * m2 ** 2
  1093. # floating point error
  1094. #
  1095. # #18044 in _libs/windows.pyx calc_kurt follow this behavior
  1096. # to fix the fperr to treat denom <1e-14 as zero
  1097. numerator = _zero_out_fperr(numerator)
  1098. denominator = _zero_out_fperr(denominator)
  1099. if not isinstance(denominator, np.ndarray):
  1100. # if ``denom`` is a scalar, check these corner cases first before
  1101. # doing division
  1102. if count < 4:
  1103. return np.nan
  1104. if denominator == 0:
  1105. return 0
  1106. with np.errstate(invalid="ignore", divide="ignore"):
  1107. result = numerator / denominator - adj
  1108. dtype = values.dtype
  1109. if is_float_dtype(dtype):
  1110. result = result.astype(dtype, copy=False)
  1111. if isinstance(result, np.ndarray):
  1112. result = np.where(denominator == 0, 0, result)
  1113. result[count < 4] = np.nan
  1114. return result
  1115. @disallow("M8", "m8")
  1116. @maybe_operate_rowwise
  1117. def nanprod(
  1118. values: np.ndarray,
  1119. *,
  1120. axis: int | None = None,
  1121. skipna: bool = True,
  1122. min_count: int = 0,
  1123. mask: npt.NDArray[np.bool_] | None = None,
  1124. ) -> float:
  1125. """
  1126. Parameters
  1127. ----------
  1128. values : ndarray[dtype]
  1129. axis : int, optional
  1130. skipna : bool, default True
  1131. min_count: int, default 0
  1132. mask : ndarray[bool], optional
  1133. nan-mask if known
  1134. Returns
  1135. -------
  1136. Dtype
  1137. The product of all elements on a given axis. ( NaNs are treated as 1)
  1138. Examples
  1139. --------
  1140. >>> import pandas.core.nanops as nanops
  1141. >>> s = pd.Series([1, 2, 3, np.nan])
  1142. >>> nanops.nanprod(s)
  1143. 6.0
  1144. """
  1145. mask = _maybe_get_mask(values, skipna, mask)
  1146. if skipna and mask is not None:
  1147. values = values.copy()
  1148. values[mask] = 1
  1149. result = values.prod(axis)
  1150. # error: Incompatible return value type (got "Union[ndarray, float]", expected
  1151. # "float")
  1152. return _maybe_null_out( # type: ignore[return-value]
  1153. result, axis, mask, values.shape, min_count=min_count
  1154. )
  1155. def _maybe_arg_null_out(
  1156. result: np.ndarray,
  1157. axis: int | None,
  1158. mask: npt.NDArray[np.bool_] | None,
  1159. skipna: bool,
  1160. ) -> np.ndarray | int:
  1161. # helper function for nanargmin/nanargmax
  1162. if mask is None:
  1163. return result
  1164. if axis is None or not getattr(result, "ndim", False):
  1165. if skipna:
  1166. if mask.all():
  1167. # error: Incompatible types in assignment (expression has type
  1168. # "int", variable has type "ndarray")
  1169. result = -1 # type: ignore[assignment]
  1170. else:
  1171. if mask.any():
  1172. # error: Incompatible types in assignment (expression has type
  1173. # "int", variable has type "ndarray")
  1174. result = -1 # type: ignore[assignment]
  1175. else:
  1176. if skipna:
  1177. na_mask = mask.all(axis)
  1178. else:
  1179. na_mask = mask.any(axis)
  1180. if na_mask.any():
  1181. result[na_mask] = -1
  1182. return result
  1183. def _get_counts(
  1184. values_shape: Shape,
  1185. mask: npt.NDArray[np.bool_] | None,
  1186. axis: int | None,
  1187. dtype: np.dtype = np.dtype(np.float64),
  1188. ) -> int | float | np.ndarray:
  1189. """
  1190. Get the count of non-null values along an axis
  1191. Parameters
  1192. ----------
  1193. values_shape : tuple of int
  1194. shape tuple from values ndarray, used if mask is None
  1195. mask : Optional[ndarray[bool]]
  1196. locations in values that should be considered missing
  1197. axis : Optional[int]
  1198. axis to count along
  1199. dtype : type, optional
  1200. type to use for count
  1201. Returns
  1202. -------
  1203. count : scalar or array
  1204. """
  1205. if axis is None:
  1206. if mask is not None:
  1207. n = mask.size - mask.sum()
  1208. else:
  1209. n = np.prod(values_shape)
  1210. return dtype.type(n)
  1211. if mask is not None:
  1212. count = mask.shape[axis] - mask.sum(axis)
  1213. else:
  1214. count = values_shape[axis]
  1215. if is_scalar(count):
  1216. return dtype.type(count)
  1217. return count.astype(dtype, copy=False)
  1218. def _maybe_null_out(
  1219. result: np.ndarray | float | NaTType,
  1220. axis: int | None,
  1221. mask: npt.NDArray[np.bool_] | None,
  1222. shape: tuple[int, ...],
  1223. min_count: int = 1,
  1224. ) -> np.ndarray | float | NaTType:
  1225. """
  1226. Returns
  1227. -------
  1228. Dtype
  1229. The product of all elements on a given axis. ( NaNs are treated as 1)
  1230. """
  1231. if axis is not None and isinstance(result, np.ndarray):
  1232. if mask is not None:
  1233. null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
  1234. else:
  1235. # we have no nulls, kept mask=None in _maybe_get_mask
  1236. below_count = shape[axis] - min_count < 0
  1237. new_shape = shape[:axis] + shape[axis + 1 :]
  1238. null_mask = np.broadcast_to(below_count, new_shape)
  1239. if np.any(null_mask):
  1240. if is_numeric_dtype(result):
  1241. if np.iscomplexobj(result):
  1242. result = result.astype("c16")
  1243. else:
  1244. result = result.astype("f8")
  1245. result[null_mask] = np.nan
  1246. else:
  1247. # GH12941, use None to auto cast null
  1248. result[null_mask] = None
  1249. elif result is not NaT:
  1250. if check_below_min_count(shape, mask, min_count):
  1251. result = np.nan
  1252. return result
  1253. def check_below_min_count(
  1254. shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int
  1255. ) -> bool:
  1256. """
  1257. Check for the `min_count` keyword. Returns True if below `min_count` (when
  1258. missing value should be returned from the reduction).
  1259. Parameters
  1260. ----------
  1261. shape : tuple
  1262. The shape of the values (`values.shape`).
  1263. mask : ndarray[bool] or None
  1264. Boolean numpy array (typically of same shape as `shape`) or None.
  1265. min_count : int
  1266. Keyword passed through from sum/prod call.
  1267. Returns
  1268. -------
  1269. bool
  1270. """
  1271. if min_count > 0:
  1272. if mask is None:
  1273. # no missing values, only check size
  1274. non_nulls = np.prod(shape)
  1275. else:
  1276. non_nulls = mask.size - mask.sum()
  1277. if non_nulls < min_count:
  1278. return True
  1279. return False
  1280. def _zero_out_fperr(arg):
  1281. # #18044 reference this behavior to fix rolling skew/kurt issue
  1282. if isinstance(arg, np.ndarray):
  1283. with np.errstate(invalid="ignore"):
  1284. return np.where(np.abs(arg) < 1e-14, 0, arg)
  1285. else:
  1286. return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
  1287. @disallow("M8", "m8")
  1288. def nancorr(
  1289. a: np.ndarray, b: np.ndarray, *, method="pearson", min_periods: int | None = None
  1290. ):
  1291. """
  1292. a, b: ndarrays
  1293. """
  1294. if len(a) != len(b):
  1295. raise AssertionError("Operands to nancorr must have same size")
  1296. if min_periods is None:
  1297. min_periods = 1
  1298. valid = notna(a) & notna(b)
  1299. if not valid.all():
  1300. a = a[valid]
  1301. b = b[valid]
  1302. if len(a) < min_periods:
  1303. return np.nan
  1304. f = get_corr_func(method)
  1305. return f(a, b)
  1306. def get_corr_func(method):
  1307. if method == "kendall":
  1308. from scipy.stats import kendalltau
  1309. def func(a, b):
  1310. return kendalltau(a, b)[0]
  1311. return func
  1312. elif method == "spearman":
  1313. from scipy.stats import spearmanr
  1314. def func(a, b):
  1315. return spearmanr(a, b)[0]
  1316. return func
  1317. elif method == "pearson":
  1318. def func(a, b):
  1319. return np.corrcoef(a, b)[0, 1]
  1320. return func
  1321. elif callable(method):
  1322. return method
  1323. raise ValueError(
  1324. f"Unknown method '{method}', expected one of "
  1325. "'kendall', 'spearman', 'pearson', or callable"
  1326. )
  1327. @disallow("M8", "m8")
  1328. def nancov(
  1329. a: np.ndarray,
  1330. b: np.ndarray,
  1331. *,
  1332. min_periods: int | None = None,
  1333. ddof: int | None = 1,
  1334. ):
  1335. if len(a) != len(b):
  1336. raise AssertionError("Operands to nancov must have same size")
  1337. if min_periods is None:
  1338. min_periods = 1
  1339. valid = notna(a) & notna(b)
  1340. if not valid.all():
  1341. a = a[valid]
  1342. b = b[valid]
  1343. if len(a) < min_periods:
  1344. return np.nan
  1345. return np.cov(a, b, ddof=ddof)[0, 1]
  1346. def _ensure_numeric(x):
  1347. if isinstance(x, np.ndarray):
  1348. if is_integer_dtype(x) or is_bool_dtype(x):
  1349. x = x.astype(np.float64)
  1350. elif is_object_dtype(x):
  1351. try:
  1352. x = x.astype(np.complex128)
  1353. except (TypeError, ValueError):
  1354. try:
  1355. x = x.astype(np.float64)
  1356. except ValueError as err:
  1357. # GH#29941 we get here with object arrays containing strs
  1358. raise TypeError(f"Could not convert {x} to numeric") from err
  1359. else:
  1360. if not np.any(np.imag(x)):
  1361. x = x.real
  1362. elif not (is_float(x) or is_integer(x) or is_complex(x)):
  1363. try:
  1364. x = float(x)
  1365. except (TypeError, ValueError):
  1366. # e.g. "1+1j" or "foo"
  1367. try:
  1368. x = complex(x)
  1369. except ValueError as err:
  1370. # e.g. "foo"
  1371. raise TypeError(f"Could not convert {x} to numeric") from err
  1372. return x
  1373. # NA-friendly array comparisons
  1374. def make_nancomp(op):
  1375. def f(x, y):
  1376. xmask = isna(x)
  1377. ymask = isna(y)
  1378. mask = xmask | ymask
  1379. with np.errstate(all="ignore"):
  1380. result = op(x, y)
  1381. if mask.any():
  1382. if is_bool_dtype(result):
  1383. result = result.astype("O")
  1384. np.putmask(result, mask, np.nan)
  1385. return result
  1386. return f
# One wrapper per ``operator`` comparison function, each built by
# ``make_nancomp``.
nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)
  1393. def _nanpercentile_1d(
  1394. values: np.ndarray,
  1395. mask: npt.NDArray[np.bool_],
  1396. q: np.ndarray,
  1397. na_value: Scalar,
  1398. interpolation,
  1399. ) -> Scalar | np.ndarray:
  1400. """
  1401. Wrapper for np.percentile that skips missing values, specialized to
  1402. 1-dimensional case.
  1403. Parameters
  1404. ----------
  1405. values : array over which to find quantiles
  1406. mask : ndarray[bool]
  1407. locations in values that should be considered missing
  1408. q : np.ndarray[float64] of quantile indices to find
  1409. na_value : scalar
  1410. value to return for empty or all-null values
  1411. interpolation : str
  1412. Returns
  1413. -------
  1414. quantiles : scalar or array
  1415. """
  1416. # mask is Union[ExtensionArray, ndarray]
  1417. values = values[~mask]
  1418. if len(values) == 0:
  1419. return np.array([na_value] * len(q), dtype=values.dtype)
  1420. return np.percentile(values, q, interpolation=interpolation)
  1421. def nanpercentile(
  1422. values: np.ndarray,
  1423. q: np.ndarray,
  1424. *,
  1425. na_value,
  1426. mask: npt.NDArray[np.bool_],
  1427. interpolation,
  1428. ):
  1429. """
  1430. Wrapper for np.percentile that skips missing values.
  1431. Parameters
  1432. ----------
  1433. values : np.ndarray[ndim=2] over which to find quantiles
  1434. q : np.ndarray[float64] of quantile indices to find
  1435. na_value : scalar
  1436. value to return for empty or all-null values
  1437. mask : ndarray[bool]
  1438. locations in values that should be considered missing
  1439. interpolation : str
  1440. Returns
  1441. -------
  1442. quantiles : scalar or array
  1443. """
  1444. if values.dtype.kind in ["m", "M"]:
  1445. # need to cast to integer to avoid rounding errors in numpy
  1446. result = nanpercentile(
  1447. values.view("i8"),
  1448. q=q,
  1449. na_value=na_value.view("i8"),
  1450. mask=mask,
  1451. interpolation=interpolation,
  1452. )
  1453. # Note: we have to do `astype` and not view because in general we
  1454. # have float result at this point, not i8
  1455. return result.astype(values.dtype)
  1456. if not lib.is_scalar(mask) and mask.any():
  1457. # Caller is responsible for ensuring mask shape match
  1458. assert mask.shape == values.shape
  1459. result = [
  1460. _nanpercentile_1d(val, m, q, na_value, interpolation=interpolation)
  1461. for (val, m) in zip(list(values), list(mask))
  1462. ]
  1463. result = np.array(result, dtype=values.dtype, copy=False).T
  1464. return result
  1465. else:
  1466. return np.percentile(values, q, axis=1, interpolation=interpolation)
  1467. def na_accum_func(values: ArrayLike, accum_func, *, skip

Large files are truncated, but you can click here to view the full file