PageRenderTime 33ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/tests/arrays/sparse/test_array.py

https://github.com/pydata/pandas
Python | 472 lines | 355 code | 81 blank | 36 comment | 11 complexity | 09ca3e321fdfdb54d03d7d3a87af980a MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import re
  2. import warnings
  3. import numpy as np
  4. import pytest
  5. from pandas._libs.sparse import IntIndex
  6. import pandas as pd
  7. from pandas import isna
  8. import pandas._testing as tm
  9. from pandas.core.api import Int64Index
  10. from pandas.core.arrays.sparse import (
  11. SparseArray,
  12. SparseDtype,
  13. )
  14. @pytest.fixture
  15. def arr_data():
  16. """Fixture returning numpy array with valid and missing entries"""
  17. return np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
  18. @pytest.fixture
  19. def arr(arr_data):
  20. """Fixture returning SparseArray from 'arr_data'"""
  21. return SparseArray(arr_data)
  22. @pytest.fixture
  23. def zarr():
  24. """Fixture returning SparseArray with integer entries and 'fill_value=0'"""
  25. return SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
  26. class TestSparseArray:
  27. @pytest.mark.parametrize("fill_value", [0, None, np.nan])
  28. def test_shift_fill_value(self, fill_value):
  29. # GH #24128
  30. sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0)
  31. res = sparse.shift(1, fill_value=fill_value)
  32. if isna(fill_value):
  33. fill_value = res.dtype.na_value
  34. exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0)
  35. tm.assert_sp_array_equal(res, exp)
  36. def test_set_fill_value(self):
  37. arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan)
  38. arr.fill_value = 2
  39. assert arr.fill_value == 2
  40. arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
  41. arr.fill_value = 2
  42. assert arr.fill_value == 2
  43. # TODO: this seems fine? You can construct an integer
  44. # sparsearray with NaN fill value, why not update one?
  45. # coerces to int
  46. # msg = "unable to set fill_value 3\\.1 to int64 dtype"
  47. # with pytest.raises(ValueError, match=msg):
  48. arr.fill_value = 3.1
  49. assert arr.fill_value == 3.1
  50. # msg = "unable to set fill_value nan to int64 dtype"
  51. # with pytest.raises(ValueError, match=msg):
  52. arr.fill_value = np.nan
  53. assert np.isnan(arr.fill_value)
  54. arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
  55. arr.fill_value = True
  56. assert arr.fill_value
  57. # FIXME: don't leave commented-out
  58. # coerces to bool
  59. # TODO: we can construct an sparse array of bool
  60. # type and use as fill_value any value
  61. # msg = "fill_value must be True, False or nan"
  62. # with pytest.raises(ValueError, match=msg):
  63. # arr.fill_value = 0
  64. # msg = "unable to set fill_value nan to bool dtype"
  65. # with pytest.raises(ValueError, match=msg):
  66. arr.fill_value = np.nan
  67. assert np.isnan(arr.fill_value)
  68. @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)])
  69. def test_set_fill_invalid_non_scalar(self, val):
  70. arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
  71. msg = "fill_value must be a scalar"
  72. with pytest.raises(ValueError, match=msg):
  73. arr.fill_value = val
  74. def test_copy(self, arr):
  75. arr2 = arr.copy()
  76. assert arr2.sp_values is not arr.sp_values
  77. assert arr2.sp_index is arr.sp_index
  78. def test_values_asarray(self, arr_data, arr):
  79. tm.assert_almost_equal(arr.to_dense(), arr_data)
  80. @pytest.mark.parametrize(
  81. "data,shape,dtype",
  82. [
  83. ([0, 0, 0, 0, 0], (5,), None),
  84. ([], (0,), None),
  85. ([0], (1,), None),
  86. (["A", "A", np.nan, "B"], (4,), object),
  87. ],
  88. )
  89. def test_shape(self, data, shape, dtype):
  90. # GH 21126
  91. out = SparseArray(data, dtype=dtype)
  92. assert out.shape == shape
  93. @pytest.mark.parametrize(
  94. "vals",
  95. [
  96. [np.nan, np.nan, np.nan, np.nan, np.nan],
  97. [1, np.nan, np.nan, 3, np.nan],
  98. [1, np.nan, 0, 3, 0],
  99. ],
  100. )
  101. @pytest.mark.parametrize("fill_value", [None, 0])
  102. def test_dense_repr(self, vals, fill_value):
  103. vals = np.array(vals)
  104. arr = SparseArray(vals, fill_value=fill_value)
  105. res = arr.to_dense()
  106. tm.assert_numpy_array_equal(res, vals)
  107. @pytest.mark.parametrize("fix", ["arr", "zarr"])
  108. def test_pickle(self, fix, request):
  109. obj = request.getfixturevalue(fix)
  110. unpickled = tm.round_trip_pickle(obj)
  111. tm.assert_sp_array_equal(unpickled, obj)
  112. def test_generator_warnings(self):
  113. sp_arr = SparseArray([1, 2, 3])
  114. with warnings.catch_warnings(record=True) as w:
  115. warnings.filterwarnings(action="always", category=DeprecationWarning)
  116. warnings.filterwarnings(action="always", category=PendingDeprecationWarning)
  117. for _ in sp_arr:
  118. pass
  119. assert len(w) == 0
  120. def test_where_retain_fill_value(self):
  121. # GH#45691 don't lose fill_value on _where
  122. arr = SparseArray([np.nan, 1.0], fill_value=0)
  123. mask = np.array([True, False])
  124. res = arr._where(~mask, 1)
  125. exp = SparseArray([1, 1.0], fill_value=0)
  126. tm.assert_sp_array_equal(res, exp)
  127. ser = pd.Series(arr)
  128. res = ser.where(~mask, 1)
  129. tm.assert_series_equal(res, pd.Series(exp))
  130. def test_fillna(self):
  131. s = SparseArray([1, np.nan, np.nan, 3, np.nan])
  132. res = s.fillna(-1)
  133. exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64)
  134. tm.assert_sp_array_equal(res, exp)
  135. s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
  136. res = s.fillna(-1)
  137. exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64)
  138. tm.assert_sp_array_equal(res, exp)
  139. s = SparseArray([1, np.nan, 0, 3, 0])
  140. res = s.fillna(-1)
  141. exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64)
  142. tm.assert_sp_array_equal(res, exp)
  143. s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0)
  144. res = s.fillna(-1)
  145. exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64)
  146. tm.assert_sp_array_equal(res, exp)
  147. s = SparseArray([np.nan, np.nan, np.nan, np.nan])
  148. res = s.fillna(-1)
  149. exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64)
  150. tm.assert_sp_array_equal(res, exp)
  151. s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0)
  152. res = s.fillna(-1)
  153. exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64)
  154. tm.assert_sp_array_equal(res, exp)
  155. # float dtype's fill_value is np.nan, replaced by -1
  156. s = SparseArray([0.0, 0.0, 0.0, 0.0])
  157. res = s.fillna(-1)
  158. exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1)
  159. tm.assert_sp_array_equal(res, exp)
  160. # int dtype shouldn't have missing. No changes.
  161. s = SparseArray([0, 0, 0, 0])
  162. assert s.dtype == SparseDtype(np.int64)
  163. assert s.fill_value == 0
  164. res = s.fillna(-1)
  165. tm.assert_sp_array_equal(res, s)
  166. s = SparseArray([0, 0, 0, 0], fill_value=0)
  167. assert s.dtype == SparseDtype(np.int64)
  168. assert s.fill_value == 0
  169. res = s.fillna(-1)
  170. exp = SparseArray([0, 0, 0, 0], fill_value=0)
  171. tm.assert_sp_array_equal(res, exp)
  172. # fill_value can be nan if there is no missing hole.
  173. # only fill_value will be changed
  174. s = SparseArray([0, 0, 0, 0], fill_value=np.nan)
  175. assert s.dtype == SparseDtype(np.int64, fill_value=np.nan)
  176. assert np.isnan(s.fill_value)
  177. res = s.fillna(-1)
  178. exp = SparseArray([0, 0, 0, 0], fill_value=-1)
  179. tm.assert_sp_array_equal(res, exp)
  180. def test_fillna_overlap(self):
  181. s = SparseArray([1, np.nan, np.nan, 3, np.nan])
  182. # filling with existing value doesn't replace existing value with
  183. # fill_value, i.e. existing 3 remains in sp_values
  184. res = s.fillna(3)
  185. exp = np.array([1, 3, 3, 3, 3], dtype=np.float64)
  186. tm.assert_numpy_array_equal(res.to_dense(), exp)
  187. s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
  188. res = s.fillna(3)
  189. exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64)
  190. tm.assert_sp_array_equal(res, exp)
  191. def test_nonzero(self):
  192. # Tests regression #21172.
  193. sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
  194. expected = np.array([2, 5, 9], dtype=np.int32)
  195. (result,) = sa.nonzero()
  196. tm.assert_numpy_array_equal(expected, result)
  197. sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
  198. (result,) = sa.nonzero()
  199. tm.assert_numpy_array_equal(expected, result)
  200. class TestSparseArrayAnalytics:
  201. @pytest.mark.parametrize(
  202. "data,expected",
  203. [
  204. (
  205. np.array([1, 2, 3, 4, 5], dtype=float), # non-null data
  206. SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])),
  207. ),
  208. (
  209. np.array([1, 2, np.nan, 4, 5], dtype=float), # null data
  210. SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])),
  211. ),
  212. ],
  213. )
  214. @pytest.mark.parametrize("numpy", [True, False])
  215. def test_cumsum(self, data, expected, numpy):
  216. cumsum = np.cumsum if numpy else lambda s: s.cumsum()
  217. out = cumsum(SparseArray(data))
  218. tm.assert_sp_array_equal(out, expected)
  219. out = cumsum(SparseArray(data, fill_value=np.nan))
  220. tm.assert_sp_array_equal(out, expected)
  221. out = cumsum(SparseArray(data, fill_value=2))
  222. tm.assert_sp_array_equal(out, expected)
  223. if numpy: # numpy compatibility checks.
  224. msg = "the 'dtype' parameter is not supported"
  225. with pytest.raises(ValueError, match=msg):
  226. np.cumsum(SparseArray(data), dtype=np.int64)
  227. msg = "the 'out' parameter is not supported"
  228. with pytest.raises(ValueError, match=msg):
  229. np.cumsum(SparseArray(data), out=out)
  230. else:
  231. axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid.
  232. msg = re.escape(f"axis(={axis}) out of bounds")
  233. with pytest.raises(ValueError, match=msg):
  234. SparseArray(data).cumsum(axis=axis)
  235. def test_ufunc(self):
  236. # GH 13853 make sure ufunc is applied to fill_value
  237. sparse = SparseArray([1, np.nan, 2, np.nan, -2])
  238. result = SparseArray([1, np.nan, 2, np.nan, 2])
  239. tm.assert_sp_array_equal(abs(sparse), result)
  240. tm.assert_sp_array_equal(np.abs(sparse), result)
  241. sparse = SparseArray([1, -1, 2, -2], fill_value=1)
  242. result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1)
  243. tm.assert_sp_array_equal(abs(sparse), result)
  244. tm.assert_sp_array_equal(np.abs(sparse), result)
  245. sparse = SparseArray([1, -1, 2, -2], fill_value=-1)
  246. exp = SparseArray([1, 1, 2, 2], fill_value=1)
  247. tm.assert_sp_array_equal(abs(sparse), exp)
  248. tm.assert_sp_array_equal(np.abs(sparse), exp)
  249. sparse = SparseArray([1, np.nan, 2, np.nan, -2])
  250. result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2]))
  251. tm.assert_sp_array_equal(np.sin(sparse), result)
  252. sparse = SparseArray([1, -1, 2, -2], fill_value=1)
  253. result = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1))
  254. tm.assert_sp_array_equal(np.sin(sparse), result)
  255. sparse = SparseArray([1, -1, 0, -2], fill_value=0)
  256. result = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0))
  257. tm.assert_sp_array_equal(np.sin(sparse), result)
  258. def test_ufunc_args(self):
  259. # GH 13853 make sure ufunc is applied to fill_value, including its arg
  260. sparse = SparseArray([1, np.nan, 2, np.nan, -2])
  261. result = SparseArray([2, np.nan, 3, np.nan, -1])
  262. tm.assert_sp_array_equal(np.add(sparse, 1), result)
  263. sparse = SparseArray([1, -1, 2, -2], fill_value=1)
  264. result = SparseArray([2, 0, 3, -1], fill_value=2)
  265. tm.assert_sp_array_equal(np.add(sparse, 1), result)
  266. sparse = SparseArray([1, -1, 0, -2], fill_value=0)
  267. result = SparseArray([2, 0, 1, -1], fill_value=1)
  268. tm.assert_sp_array_equal(np.add(sparse, 1), result)
  269. @pytest.mark.parametrize("fill_value", [0.0, np.nan])
  270. def test_modf(self, fill_value):
  271. # https://github.com/pandas-dev/pandas/issues/26946
  272. sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value)
  273. r1, r2 = np.modf(sparse)
  274. e1, e2 = np.modf(np.asarray(sparse))
  275. tm.assert_sp_array_equal(r1, SparseArray(e1, fill_value=fill_value))
  276. tm.assert_sp_array_equal(r2, SparseArray(e2, fill_value=fill_value))
  277. def test_nbytes_integer(self):
  278. arr = SparseArray([1, 0, 0, 0, 2], kind="integer")
  279. result = arr.nbytes
  280. # (2 * 8) + 2 * 4
  281. assert result == 24
  282. def test_nbytes_block(self):
  283. arr = SparseArray([1, 2, 0, 0, 0], kind="block")
  284. result = arr.nbytes
  285. # (2 * 8) + 4 + 4
  286. # sp_values, blocs, blengths
  287. assert result == 24
  288. def test_asarray_datetime64(self):
  289. s = SparseArray(pd.to_datetime(["2012", None, None, "2013"]))
  290. np.asarray(s)
  291. def test_density(self):
  292. arr = SparseArray([0, 1])
  293. assert arr.density == 0.5
  294. def test_npoints(self):
  295. arr = SparseArray([0, 1])
  296. assert arr.npoints == 1
  297. def test_setting_fill_value_fillna_still_works():
  298. # This is why letting users update fill_value / dtype is bad
  299. # astype has the same problem.
  300. arr = SparseArray([1.0, np.nan, 1.0], fill_value=0.0)
  301. arr.fill_value = np.nan
  302. result = arr.isna()
  303. # Can't do direct comparison, since the sp_index will be different
  304. # So let's convert to ndarray and check there.
  305. result = np.asarray(result)
  306. expected = np.array([False, True, False])
  307. tm.assert_numpy_array_equal(result, expected)
  308. def test_setting_fill_value_updates():
  309. arr = SparseArray([0.0, np.nan], fill_value=0)
  310. arr.fill_value = np.nan
  311. # use private constructor to get the index right
  312. # otherwise both nans would be un-stored.
  313. expected = SparseArray._simple_new(
  314. sparse_array=np.array([np.nan]),
  315. sparse_index=IntIndex(2, [1]),
  316. dtype=SparseDtype(float, np.nan),
  317. )
  318. tm.assert_sp_array_equal(arr, expected)
  319. @pytest.mark.parametrize(
  320. "arr, loc",
  321. [
  322. ([None, 1, 2], 0),
  323. ([0, None, 2], 1),
  324. ([0, 1, None], 2),
  325. ([0, 1, 1, None, None], 3),
  326. ([1, 1, 1, 2], -1),
  327. ([], -1),
  328. ],
  329. )
  330. def test_first_fill_value_loc(arr, loc):
  331. result = SparseArray(arr)._first_fill_value_loc()
  332. assert result == loc
  333. @pytest.mark.parametrize(
  334. "arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]]
  335. )
  336. @pytest.mark.parametrize("fill_value", [np.nan, 0, 1])
  337. def test_unique_na_fill(arr, fill_value):
  338. a = SparseArray(arr, fill_value=fill_value).unique()
  339. b = pd.Series(arr).unique()
  340. assert isinstance(a, SparseArray)
  341. a = np.asarray(a)
  342. tm.assert_numpy_array_equal(a, b)
  343. def test_unique_all_sparse():
  344. # https://github.com/pandas-dev/pandas/issues/23168
  345. arr = SparseArray([0, 0])
  346. result = arr.unique()
  347. expected = SparseArray([0])
  348. tm.assert_sp_array_equal(result, expected)
  349. def test_map():
  350. arr = SparseArray([0, 1, 2])
  351. expected = SparseArray([10, 11, 12], fill_value=10)
  352. # dict
  353. result = arr.map({0: 10, 1: 11, 2: 12})
  354. tm.assert_sp_array_equal(result, expected)
  355. # series
  356. result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
  357. tm.assert_sp_array_equal(result, expected)
  358. # function
  359. result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
  360. expected = SparseArray([10, 11, 12], fill_value=10)
  361. tm.assert_sp_array_equal(result, expected)
  362. def test_map_missing():
  363. arr = SparseArray([0, 1, 2])
  364. expected = SparseArray([10, 11, None], fill_value=10)
  365. result = arr.map({0: 10, 1: 11})
  366. tm.assert_sp_array_equal(result, expected)
  367. @pytest.mark.parametrize("fill_value", [np.nan, 1])
  368. def test_dropna(fill_value):
  369. # GH-28287
  370. arr = SparseArray([np.nan, 1], fill_value=fill_value)
  371. exp = SparseArray([1.0], fill_value=fill_value)
  372. tm.assert_sp_array_equal(arr.dropna(), exp)
  373. df = pd.DataFrame({"a": [0, 1], "b": arr})
  374. expected_df = pd.DataFrame({"a": [1], "b": exp}, index=Int64Index([1]))
  375. tm.assert_equal(df.dropna(), expected_df)
  376. def test_drop_duplicates_fill_value():
  377. # GH 11726
  378. df = pd.DataFrame(np.zeros((5, 5))).apply(lambda x: SparseArray(x, fill_value=0))
  379. result = df.drop_duplicates()
  380. expected = pd.DataFrame({i: SparseArray([0.0], fill_value=0) for i in range(5)})
  381. tm.assert_frame_equal(result, expected)