PageRenderTime 50ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/sparse/array.py

http://github.com/wesm/pandas
Python | 753 lines | 679 code | 28 blank | 46 comment | 31 complexity | 51062ea59eed372167f748aa5f342da6 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. """
  2. SparseArray data structure
  3. """
  4. from __future__ import division
  5. # pylint: disable=E1101,E1103,W0231
  6. from numpy import nan, ndarray
  7. import numpy as np
  8. import pandas as pd
  9. from pandas.core.base import PandasObject
  10. from pandas import compat, lib
  11. from pandas.compat import range
  12. from pandas.compat.numpy import function as nv
  13. from pandas.types.generic import ABCSparseArray, ABCSparseSeries
  14. from pandas.types.common import (is_float, is_integer,
  15. is_integer_dtype, _ensure_platform_int,
  16. is_bool_dtype,
  17. is_list_like,
  18. is_scalar, is_dtype_equal)
  19. from pandas.types.cast import (_possibly_convert_platform, _maybe_promote,
  20. _astype_nansafe)
  21. from pandas.types.missing import isnull, notnull
  22. from pandas._sparse import SparseIndex, BlockIndex, IntIndex
  23. import pandas._sparse as splib
  24. import pandas.index as _index
  25. import pandas.core.algorithms as algos
  26. import pandas.core.ops as ops
  27. import pandas.formats.printing as printing
  28. from pandas.util.decorators import Appender
  29. from pandas.indexes.base import _index_shared_docs
# Keyword substitutions applied (via ``%``) to the shared docstring templates
# that are appended to SparseArray.take and SparseArray.fillna.
_sparray_doc_kwargs = dict(klass='SparseArray')
  31. def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None,
  32. **eval_kwargs):
  33. """
  34. Wrapper function for Series arithmetic operations, to avoid
  35. code duplication.
  36. """
  37. def wrapper(self, other):
  38. if isinstance(other, np.ndarray):
  39. if len(self) != len(other):
  40. raise AssertionError("length mismatch: %d vs. %d" %
  41. (len(self), len(other)))
  42. if not isinstance(other, ABCSparseArray):
  43. dtype = getattr(other, 'dtype', None)
  44. other = SparseArray(other, fill_value=self.fill_value,
  45. dtype=dtype)
  46. return _sparse_array_op(self, other, op, name)
  47. elif is_scalar(other):
  48. fill = op(_get_fill(self), np.asarray(other))
  49. return _wrap_result(name, op(self.sp_values, other),
  50. self.sp_index, fill)
  51. else: # pragma: no cover
  52. raise TypeError('operation with %s not supported' % type(other))
  53. if name.startswith("__"):
  54. name = name[2:-2]
  55. wrapper.__name__ = name
  56. return wrapper
  57. def _maybe_match_dtype(left, right):
  58. if not hasattr(right, 'dtype'):
  59. return left.dtype
  60. elif left.dtype == right.dtype:
  61. return getattr(left.dtype, '__name__', left.dtype)
  62. else:
  63. # ToDo: to be supported after GH 667
  64. raise NotImplementedError('dtypes must be identical')
  65. def _get_fill(arr):
  66. # coerce fill_value to arr dtype if possible
  67. # int64 SparseArray can have NaN as fill_value if there is no missing
  68. try:
  69. return np.asarray(arr.fill_value, dtype=arr.dtype)
  70. except ValueError:
  71. return np.asarray(arr.fill_value)
  72. def _sparse_array_op(left, right, op, name, series=False):
  73. if series and is_integer_dtype(left) and is_integer_dtype(right):
  74. # series coerces to float64 if result should have NaN/inf
  75. if name in ('floordiv', 'mod') and (right.values == 0).any():
  76. left = left.astype(np.float64)
  77. right = right.astype(np.float64)
  78. elif name in ('rfloordiv', 'rmod') and (left.values == 0).any():
  79. left = left.astype(np.float64)
  80. right = right.astype(np.float64)
  81. dtype = _maybe_match_dtype(left, right)
  82. if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
  83. result = op(left.get_values(), right.get_values())
  84. if left.sp_index.ngaps == 0:
  85. index = left.sp_index
  86. else:
  87. index = right.sp_index
  88. fill = op(_get_fill(left), _get_fill(right))
  89. elif left.sp_index.equals(right.sp_index):
  90. result = op(left.sp_values, right.sp_values)
  91. index = left.sp_index
  92. fill = op(_get_fill(left), _get_fill(right))
  93. else:
  94. if name[0] == 'r':
  95. left, right = right, left
  96. name = name[1:]
  97. opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
  98. sparse_op = getattr(splib, opname)
  99. result, index, fill = sparse_op(left.sp_values, left.sp_index,
  100. left.fill_value, right.sp_values,
  101. right.sp_index, right.fill_value)
  102. return _wrap_result(name, result, index, fill, dtype=result.dtype)
  103. def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
  104. """ wrap op result to have correct dtype """
  105. if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
  106. # ToDo: We can remove this condition when removing
  107. # SparseArray's dtype default when closing GH 667
  108. dtype = np.bool
  109. elif name == 'truediv':
  110. dtype = np.float64
  111. return SparseArray(data, sparse_index=sparse_index,
  112. fill_value=fill_value, dtype=dtype)
  113. class SparseArray(PandasObject, np.ndarray):
  114. """Data structure for labeled, sparse floating point 1-D data
  115. Parameters
  116. ----------
  117. data : {array-like (1-D), Series, SparseSeries, dict}
  118. kind : {'block', 'integer'}
  119. fill_value : float
  120. Defaults to NaN (code for missing)
  121. sparse_index : {BlockIndex, IntIndex}, optional
  122. Only if you have one. Mainly used internally
  123. Notes
  124. -----
  125. SparseArray objects are immutable via the typical Python means. If you
  126. must change values, convert to dense, make your changes, then convert back
  127. to sparse
  128. """
  129. __array_priority__ = 15
  130. _typ = 'array'
  131. _subtyp = 'sparse_array'
  132. sp_index = None
  133. fill_value = None
  134. def __new__(cls, data, sparse_index=None, index=None, kind='integer',
  135. fill_value=None, dtype=np.float64, copy=False):
  136. if index is not None:
  137. if data is None:
  138. data = np.nan
  139. if not is_scalar(data):
  140. raise Exception("must only pass scalars with an index ")
  141. values = np.empty(len(index), dtype='float64')
  142. values.fill(data)
  143. data = values
  144. if dtype is not None:
  145. dtype = np.dtype(dtype)
  146. is_sparse_array = isinstance(data, SparseArray)
  147. if fill_value is None:
  148. if is_sparse_array:
  149. fill_value = data.fill_value
  150. else:
  151. fill_value = nan
  152. if is_sparse_array:
  153. sparse_index = data.sp_index
  154. values = np.asarray(data)
  155. else:
  156. # array-like
  157. if sparse_index is None:
  158. values, sparse_index = make_sparse(data, kind=kind,
  159. fill_value=fill_value)
  160. else:
  161. values = _sanitize_values(data)
  162. if len(values) != sparse_index.npoints:
  163. raise AssertionError("Non array-like type {0} must have"
  164. " the same length as the"
  165. " index".format(type(values)))
  166. # Create array, do *not* copy data by default
  167. if copy:
  168. try:
  169. # ToDo: Can remove this error handling when we actually
  170. # support other dtypes
  171. subarr = np.array(values, dtype=dtype, copy=True)
  172. except ValueError:
  173. subarr = np.array(values, copy=True)
  174. else:
  175. try:
  176. subarr = np.asarray(values, dtype=dtype)
  177. except ValueError:
  178. subarr = np.asarray(values)
  179. # if we have a bool type, make sure that we have a bool fill_value
  180. if ((dtype is not None and issubclass(dtype.type, np.bool_)) or
  181. (data is not None and lib.is_bool_array(subarr))):
  182. if np.isnan(fill_value) or not fill_value:
  183. fill_value = False
  184. else:
  185. fill_value = bool(fill_value)
  186. # Change the class of the array to be the subclass type.
  187. return cls._simple_new(subarr, sparse_index, fill_value)
  188. @classmethod
  189. def _simple_new(cls, data, sp_index, fill_value):
  190. if (is_integer_dtype(data) and is_float(fill_value) and
  191. sp_index.ngaps > 0):
  192. # if float fill_value is being included in dense repr,
  193. # convert values to float
  194. data = data.astype(float)
  195. result = data.view(cls)
  196. if not isinstance(sp_index, SparseIndex):
  197. # caller must pass SparseIndex
  198. raise ValueError('sp_index must be a SparseIndex')
  199. result.sp_index = sp_index
  200. result._fill_value = fill_value
  201. return result
  202. @property
  203. def _constructor(self):
  204. return lambda x: SparseArray(x, fill_value=self.fill_value,
  205. kind=self.kind)
  206. @property
  207. def kind(self):
  208. if isinstance(self.sp_index, BlockIndex):
  209. return 'block'
  210. elif isinstance(self.sp_index, IntIndex):
  211. return 'integer'
  212. def __array_wrap__(self, out_arr, context=None):
  213. """
  214. NumPy calls this method when ufunc is applied
  215. Parameters
  216. ----------
  217. out_arr : ndarray
  218. ufunc result (note that ufunc is only applied to sp_values)
  219. context : tuple of 3 elements (ufunc, signature, domain)
  220. for example, following is a context when np.sin is applied to
  221. SparseArray,
  222. (<ufunc 'sin'>, (SparseArray,), 0))
  223. See http://docs.scipy.org/doc/numpy/user/basics.subclassing.html
  224. """
  225. if isinstance(context, tuple) and len(context) == 3:
  226. ufunc, args, domain = context
  227. # to apply ufunc only to fill_value (to avoid recursive call)
  228. args = [getattr(a, 'fill_value', a) for a in args]
  229. fill_value = ufunc(self.fill_value, *args[1:])
  230. else:
  231. fill_value = self.fill_value
  232. return self._simple_new(out_arr, sp_index=self.sp_index,
  233. fill_value=fill_value)
  234. def __array_finalize__(self, obj):
  235. """
  236. Gets called after any ufunc or other array operations, necessary
  237. to pass on the index.
  238. """
  239. self.sp_index = getattr(obj, 'sp_index', None)
  240. self._fill_value = getattr(obj, 'fill_value', None)
  241. def __reduce__(self):
  242. """Necessary for making this object picklable"""
  243. object_state = list(ndarray.__reduce__(self))
  244. subclass_state = self.fill_value, self.sp_index
  245. object_state[2] = (object_state[2], subclass_state)
  246. return tuple(object_state)
  247. def __setstate__(self, state):
  248. """Necessary for making this object picklable"""
  249. nd_state, own_state = state
  250. ndarray.__setstate__(self, nd_state)
  251. fill_value, sp_index = own_state[:2]
  252. self.sp_index = sp_index
  253. self._fill_value = fill_value
  254. def __len__(self):
  255. try:
  256. return self.sp_index.length
  257. except:
  258. return 0
  259. def __unicode__(self):
  260. return '%s\nFill: %s\n%s' % (printing.pprint_thing(self),
  261. printing.pprint_thing(self.fill_value),
  262. printing.pprint_thing(self.sp_index))
  263. def disable(self, other):
  264. raise NotImplementedError('inplace binary ops not supported')
  265. # Inplace operators
  266. __iadd__ = disable
  267. __isub__ = disable
  268. __imul__ = disable
  269. __itruediv__ = disable
  270. __ifloordiv__ = disable
  271. __ipow__ = disable
  272. # Python 2 division operators
  273. if not compat.PY3:
  274. __idiv__ = disable
  275. @property
  276. def values(self):
  277. """
  278. Dense values
  279. """
  280. output = np.empty(len(self), dtype=self.dtype)
  281. int_index = self.sp_index.to_int_index()
  282. output.fill(self.fill_value)
  283. output.put(int_index.indices, self)
  284. return output
  285. @property
  286. def sp_values(self):
  287. # caching not an option, leaks memory
  288. return self.view(np.ndarray)
  289. @property
  290. def fill_value(self):
  291. return self._fill_value
  292. @fill_value.setter
  293. def fill_value(self, value):
  294. if not is_scalar(value):
  295. raise ValueError('fill_value must be a scalar')
  296. # if the specified value triggers type promotion, raise ValueError
  297. new_dtype, fill_value = _maybe_promote(self.dtype, value)
  298. if is_dtype_equal(self.dtype, new_dtype):
  299. self._fill_value = fill_value
  300. else:
  301. msg = 'unable to set fill_value {0} to {1} dtype'
  302. raise ValueError(msg.format(value, self.dtype))
  303. def get_values(self, fill=None):
  304. """ return a dense representation """
  305. return self.to_dense(fill=fill)
  306. def to_dense(self, fill=None):
  307. """
  308. Convert SparseSeries to (dense) Series
  309. """
  310. return self.values
  311. def __iter__(self):
  312. for i in range(len(self)):
  313. yield self._get_val_at(i)
  314. def __getitem__(self, key):
  315. """
  316. """
  317. if is_integer(key):
  318. return self._get_val_at(key)
  319. elif isinstance(key, tuple):
  320. data_slice = self.values[key]
  321. else:
  322. if isinstance(key, SparseArray):
  323. if is_bool_dtype(key):
  324. key = key.to_dense()
  325. else:
  326. key = np.asarray(key)
  327. if hasattr(key, '__len__') and len(self) != len(key):
  328. return self.take(key)
  329. else:
  330. data_slice = self.values[key]
  331. return self._constructor(data_slice)
  332. def __getslice__(self, i, j):
  333. if i < 0:
  334. i = 0
  335. if j < 0:
  336. j = 0
  337. slobj = slice(i, j)
  338. return self.__getitem__(slobj)
  339. def _get_val_at(self, loc):
  340. n = len(self)
  341. if loc < 0:
  342. loc += n
  343. if loc >= n or loc < 0:
  344. raise IndexError('Out of bounds access')
  345. sp_loc = self.sp_index.lookup(loc)
  346. if sp_loc == -1:
  347. return self.fill_value
  348. else:
  349. return _index.get_value_at(self, sp_loc)
  350. @Appender(_index_shared_docs['take'] % _sparray_doc_kwargs)
  351. def take(self, indices, axis=0, allow_fill=True,
  352. fill_value=None, **kwargs):
  353. """
  354. Sparse-compatible version of ndarray.take
  355. Returns
  356. -------
  357. taken : ndarray
  358. """
  359. nv.validate_take(tuple(), kwargs)
  360. if axis:
  361. raise ValueError("axis must be 0, input was {0}".format(axis))
  362. if is_integer(indices):
  363. # return scalar
  364. return self[indices]
  365. indices = _ensure_platform_int(indices)
  366. n = len(self)
  367. if allow_fill and fill_value is not None:
  368. # allow -1 to indicate self.fill_value,
  369. # self.fill_value may not be NaN
  370. if (indices < -1).any():
  371. msg = ('When allow_fill=True and fill_value is not None, '
  372. 'all indices must be >= -1')
  373. raise ValueError(msg)
  374. elif (n <= indices).any():
  375. msg = 'index is out of bounds for size {0}'
  376. raise IndexError(msg.format(n))
  377. else:
  378. if ((indices < -n) | (n <= indices)).any():
  379. msg = 'index is out of bounds for size {0}'
  380. raise IndexError(msg.format(n))
  381. indices = indices.astype(np.int32)
  382. if not (allow_fill and fill_value is not None):
  383. indices = indices.copy()
  384. indices[indices < 0] += n
  385. locs = self.sp_index.lookup_array(indices)
  386. indexer = np.arange(len(locs), dtype=np.int32)
  387. mask = locs != -1
  388. if mask.any():
  389. indexer = indexer[mask]
  390. new_values = self.sp_values.take(locs[mask])
  391. else:
  392. indexer = np.empty(shape=(0, ), dtype=np.int32)
  393. new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)
  394. sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
  395. return self._simple_new(new_values, sp_index, self.fill_value)
  396. def __setitem__(self, key, value):
  397. # if is_integer(key):
  398. # self.values[key] = value
  399. # else:
  400. # raise Exception("SparseArray does not support seting non-scalars
  401. # via setitem")
  402. raise TypeError(
  403. "SparseArray does not support item assignment via setitem")
  404. def __setslice__(self, i, j, value):
  405. if i < 0:
  406. i = 0
  407. if j < 0:
  408. j = 0
  409. slobj = slice(i, j) # noqa
  410. # if not is_scalar(value):
  411. # raise Exception("SparseArray does not support seting non-scalars
  412. # via slices")
  413. # x = self.values
  414. # x[slobj] = value
  415. # self.values = x
  416. raise TypeError("SparseArray does not support item assignment via "
  417. "slices")
  418. def astype(self, dtype=None, copy=True):
  419. dtype = np.dtype(dtype)
  420. sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy)
  421. try:
  422. fill_value = dtype.type(self.fill_value)
  423. except ValueError:
  424. msg = 'unable to coerce current fill_value {0} to {1} dtype'
  425. raise ValueError(msg.format(self.fill_value, dtype))
  426. return self._simple_new(sp_values, self.sp_index,
  427. fill_value=fill_value)
  428. def copy(self, deep=True):
  429. """
  430. Make a copy of the SparseSeries. Only the actual sparse values need to
  431. be copied
  432. """
  433. if deep:
  434. values = self.sp_values.copy()
  435. else:
  436. values = self.sp_values
  437. return SparseArray(values, sparse_index=self.sp_index,
  438. dtype=self.dtype, fill_value=self.fill_value)
  439. def count(self):
  440. """
  441. Compute sum of non-NA/null observations in SparseSeries. If the
  442. fill_value is not NaN, the "sparse" locations will be included in the
  443. observation count
  444. Returns
  445. -------
  446. nobs : int
  447. """
  448. sp_values = self.sp_values
  449. valid_spvals = np.isfinite(sp_values).sum()
  450. if self._null_fill_value:
  451. return valid_spvals
  452. else:
  453. return valid_spvals + self.sp_index.ngaps
  454. @property
  455. def _null_fill_value(self):
  456. return isnull(self.fill_value)
  457. @property
  458. def _valid_sp_values(self):
  459. sp_vals = self.sp_values
  460. mask = notnull(sp_vals)
  461. return sp_vals[mask]
  462. @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs)
  463. def fillna(self, value, downcast=None):
  464. if downcast is not None:
  465. raise NotImplementedError
  466. if issubclass(self.dtype.type, np.floating):
  467. value = float(value)
  468. if self._null_fill_value:
  469. return self._simple_new(self.sp_values, self.sp_index,
  470. fill_value=value)
  471. else:
  472. new_values = self.sp_values.copy()
  473. new_values[isnull(new_values)] = value
  474. return self._simple_new(new_values, self.sp_index,
  475. fill_value=self.fill_value)
  476. def sum(self, axis=0, *args, **kwargs):
  477. """
  478. Sum of non-NA/null values
  479. Returns
  480. -------
  481. sum : float
  482. """
  483. nv.validate_sum(args, kwargs)
  484. valid_vals = self._valid_sp_values
  485. sp_sum = valid_vals.sum()
  486. if self._null_fill_value:
  487. return sp_sum
  488. else:
  489. nsparse = self.sp_index.ngaps
  490. return sp_sum + self.fill_value * nsparse
  491. def cumsum(self, axis=0, *args, **kwargs):
  492. """
  493. Cumulative sum of values. Preserves locations of NaN values
  494. Returns
  495. -------
  496. cumsum : Series
  497. """
  498. nv.validate_cumsum(args, kwargs)
  499. # TODO: gh-12855 - return a SparseArray here
  500. if notnull(self.fill_value):
  501. return self.to_dense().cumsum()
  502. # TODO: what if sp_values contains NaN??
  503. return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index,
  504. fill_value=self.fill_value)
  505. def mean(self, axis=0, *args, **kwargs):
  506. """
  507. Mean of non-NA/null values
  508. Returns
  509. -------
  510. mean : float
  511. """
  512. nv.validate_mean(args, kwargs)
  513. valid_vals = self._valid_sp_values
  514. sp_sum = valid_vals.sum()
  515. ct = len(valid_vals)
  516. if self._null_fill_value:
  517. return sp_sum / ct
  518. else:
  519. nsparse = self.sp_index.ngaps
  520. return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
  521. def value_counts(self, dropna=True):
  522. """
  523. Returns a Series containing counts of unique values.
  524. Parameters
  525. ----------
  526. dropna : boolean, default True
  527. Don't include counts of NaN, even if NaN is in sp_values.
  528. Returns
  529. -------
  530. counts : Series
  531. """
  532. keys, counts = algos._value_counts_arraylike(self.sp_values,
  533. dropna=dropna)
  534. fcounts = self.sp_index.ngaps
  535. if fcounts > 0:
  536. if self._null_fill_value and dropna:
  537. pass
  538. else:
  539. if self._null_fill_value:
  540. mask = pd.isnull(keys)
  541. else:
  542. mask = keys == self.fill_value
  543. if mask.any():
  544. counts[mask] += fcounts
  545. else:
  546. keys = np.insert(keys, 0, self.fill_value)
  547. counts = np.insert(counts, 0, fcounts)
  548. if not isinstance(keys, pd.Index):
  549. keys = pd.Index(keys)
  550. result = pd.Series(counts, index=keys)
  551. return result
  552. def _maybe_to_dense(obj):
  553. """ try to convert to dense """
  554. if hasattr(obj, 'to_dense'):
  555. return obj.to_dense()
  556. return obj
  557. def _maybe_to_sparse(array):
  558. """ array must be SparseSeries or SparseArray """
  559. if isinstance(array, ABCSparseSeries):
  560. array = array.values.copy()
  561. return array
  562. def _sanitize_values(arr):
  563. """
  564. return an ndarray for our input,
  565. in a platform independent manner
  566. """
  567. if hasattr(arr, 'values'):
  568. arr = arr.values
  569. else:
  570. # scalar
  571. if is_scalar(arr):
  572. arr = [arr]
  573. # ndarray
  574. if isinstance(arr, np.ndarray):
  575. pass
  576. elif is_list_like(arr) and len(arr) > 0:
  577. arr = _possibly_convert_platform(arr)
  578. else:
  579. arr = np.asarray(arr)
  580. return arr
  581. def make_sparse(arr, kind='block', fill_value=nan):
  582. """
  583. Convert ndarray to sparse format
  584. Parameters
  585. ----------
  586. arr : ndarray
  587. kind : {'block', 'integer'}
  588. fill_value : NaN or another value
  589. Returns
  590. -------
  591. (sparse_values, index) : (ndarray, SparseIndex)
  592. """
  593. arr = _sanitize_values(arr)
  594. if arr.ndim > 1:
  595. raise TypeError("expected dimension <= 1 data")
  596. if isnull(fill_value):
  597. mask = notnull(arr)
  598. else:
  599. mask = arr != fill_value
  600. length = len(arr)
  601. if length != mask.size:
  602. # the arr is a SparseArray
  603. indices = mask.sp_index.indices
  604. else:
  605. indices = np.arange(length, dtype=np.int32)[mask]
  606. index = _make_index(length, indices, kind)
  607. sparsified_values = arr[mask]
  608. return sparsified_values, index
  609. def _make_index(length, indices, kind):
  610. if kind == 'block' or isinstance(kind, BlockIndex):
  611. locs, lens = splib.get_blocks(indices)
  612. index = BlockIndex(length, locs, lens)
  613. elif kind == 'integer' or isinstance(kind, IntIndex):
  614. index = IntIndex(length, indices)
  615. else: # pragma: no cover
  616. raise ValueError('must be block or integer type')
  617. return index
# Install the operator methods on SparseArray. The same factory,
# _arith_method, is supplied for both arithmetic and comparison operators.
ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
                                   comp_method=_arith_method,
                                   use_numexpr=False)