PageRenderTime 48ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/sparse/series.py

http://github.com/pydata/pandas
Python | 660 lines | 519 code | 45 blank | 96 comment | 27 complexity | 13a4244683ff834f3b72a208b41525fd MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. """
  2. Data structures for sparse float data. Life is made simpler by dealing only
  3. with float64 data
  4. """
  5. # pylint: disable=E1101,E1103,W0231
  6. from numpy import nan, ndarray
  7. import numpy as np
  8. import operator
  9. from pandas.core.common import isnull, _values_from_object, _maybe_match_name
  10. from pandas.core.index import Index, _ensure_index
  11. from pandas.core.series import Series
  12. from pandas.core.frame import DataFrame
  13. from pandas.core.internals import SingleBlockManager
  14. from pandas.core import generic
  15. import pandas.core.common as com
  16. import pandas.core.ops as ops
  17. import pandas.core.datetools as datetools
  18. import pandas.index as _index
  19. from pandas import compat
  20. from pandas.sparse.array import (make_sparse, _sparse_array_op, SparseArray)
  21. from pandas._sparse import BlockIndex, IntIndex
  22. import pandas._sparse as splib
  23. from pandas.util.decorators import Appender
  24. #------------------------------------------------------------------------------
  25. # Wrapper function for Series arithmetic methods
  26. def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None,
  27. **eval_kwargs):
  28. """
  29. Wrapper function for Series arithmetic operations, to avoid
  30. code duplication.
  31. str_rep, default_axis, fill_zeros and eval_kwargs are not used, but are present
  32. for compatibility.
  33. """
  34. def wrapper(self, other):
  35. if isinstance(other, Series):
  36. if not isinstance(other, SparseSeries):
  37. other = other.to_sparse(fill_value=self.fill_value)
  38. return _sparse_series_op(self, other, op, name)
  39. elif isinstance(other, DataFrame):
  40. return NotImplemented
  41. elif np.isscalar(other):
  42. if isnull(other) or isnull(self.fill_value):
  43. new_fill_value = np.nan
  44. else:
  45. new_fill_value = op(np.float64(self.fill_value),
  46. np.float64(other))
  47. return SparseSeries(op(self.sp_values, other),
  48. index=self.index,
  49. sparse_index=self.sp_index,
  50. fill_value=new_fill_value,
  51. name=self.name)
  52. else: # pragma: no cover
  53. raise TypeError('operation with %s not supported' % type(other))
  54. wrapper.__name__ = name
  55. if name.startswith("__"):
  56. # strip special method names, e.g. `__add__` needs to be `add` when passed
  57. # to _sparse_series_op
  58. name = name[2:-2]
  59. return wrapper
  60. def _sparse_series_op(left, right, op, name):
  61. left, right = left.align(right, join='outer', copy=False)
  62. new_index = left.index
  63. new_name = _maybe_match_name(left, right)
  64. result = _sparse_array_op(left, right, op, name)
  65. return SparseSeries(result, index=new_index, name=new_name)
  66. class SparseSeries(Series):
  67. """Data structure for labeled, sparse floating point data
  68. Parameters
  69. ----------
  70. data : {array-like, Series, SparseSeries, dict}
  71. kind : {'block', 'integer'}
  72. fill_value : float
  73. Defaults to NaN (code for missing)
  74. sparse_index : {BlockIndex, IntIndex}, optional
  75. Only if you have one. Mainly used internally
  76. Notes
  77. -----
  78. SparseSeries objects are immutable via the typical Python means. If you
  79. must change values, convert to dense, make your changes, then convert back
  80. to sparse
  81. """
  82. _subtyp = 'sparse_series'
  83. def __init__(self, data, index=None, sparse_index=None, kind='block',
  84. fill_value=None, name=None, dtype=None, copy=False,
  85. fastpath=False):
  86. # we are called internally, so short-circuit
  87. if fastpath:
  88. # data is an ndarray, index is defined
  89. data = SingleBlockManager(data, index, fastpath=True)
  90. if copy:
  91. data = data.copy()
  92. else:
  93. is_sparse_array = isinstance(data, SparseArray)
  94. if fill_value is None:
  95. if is_sparse_array:
  96. fill_value = data.fill_value
  97. else:
  98. fill_value = nan
  99. if is_sparse_array:
  100. if isinstance(data, SparseSeries) and index is None:
  101. index = data.index.view()
  102. elif index is not None:
  103. assert(len(index) == len(data))
  104. sparse_index = data.sp_index
  105. data = np.asarray(data)
  106. elif isinstance(data, SparseSeries):
  107. if index is None:
  108. index = data.index.view()
  109. # extract the SingleBlockManager
  110. data = data._data
  111. elif isinstance(data, (Series, dict)):
  112. if index is None:
  113. index = data.index.view()
  114. data = Series(data)
  115. data, sparse_index = make_sparse(data, kind=kind,
  116. fill_value=fill_value)
  117. elif isinstance(data, (tuple, list, np.ndarray)):
  118. # array-like
  119. if sparse_index is None:
  120. data, sparse_index = make_sparse(data, kind=kind,
  121. fill_value=fill_value)
  122. else:
  123. assert(len(data) == sparse_index.npoints)
  124. elif isinstance(data, SingleBlockManager):
  125. if dtype is not None:
  126. data = data.astype(dtype)
  127. if index is None:
  128. index = data.index.view()
  129. else:
  130. data = data.reindex(index, copy=False)
  131. else:
  132. length = len(index)
  133. if data == fill_value or (isnull(data)
  134. and isnull(fill_value)):
  135. if kind == 'block':
  136. sparse_index = BlockIndex(length, [], [])
  137. else:
  138. sparse_index = IntIndex(length, [])
  139. data = np.array([])
  140. else:
  141. if kind == 'block':
  142. locs, lens = ([0], [length]) if length else ([], [])
  143. sparse_index = BlockIndex(length, locs, lens)
  144. else:
  145. sparse_index = IntIndex(length, index)
  146. v = data
  147. data = np.empty(length)
  148. data.fill(v)
  149. if index is None:
  150. index = com._default_index(sparse_index.length)
  151. index = _ensure_index(index)
  152. # create/copy the manager
  153. if isinstance(data, SingleBlockManager):
  154. if copy:
  155. data = data.copy()
  156. else:
  157. # create a sparse array
  158. if not isinstance(data, SparseArray):
  159. data = SparseArray(
  160. data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype, copy=copy)
  161. data = SingleBlockManager(data, index)
  162. generic.NDFrame.__init__(self, data)
  163. self.index = index
  164. self.name = name
  165. @property
  166. def values(self):
  167. """ return the array """
  168. return self._data._values
  169. def get_values(self):
  170. """ same as values """
  171. return self._data._values.to_dense().view()
  172. @property
  173. def block(self):
  174. return self._data._block
  175. @property
  176. def fill_value(self):
  177. return self.block.fill_value
  178. @fill_value.setter
  179. def fill_value(self, v):
  180. self.block.fill_value = v
  181. @property
  182. def sp_index(self):
  183. return self.block.sp_index
  184. @property
  185. def sp_values(self):
  186. return self.values.sp_values
  187. @property
  188. def npoints(self):
  189. return self.sp_index.npoints
  190. @classmethod
  191. def from_array(cls, arr, index=None, name=None, copy=False, fill_value=None, fastpath=False):
  192. """
  193. Simplified alternate constructor
  194. """
  195. return cls(arr, index=index, name=name, copy=copy, fill_value=fill_value, fastpath=fastpath)
  196. @property
  197. def _constructor(self):
  198. return SparseSeries
  199. @property
  200. def kind(self):
  201. if isinstance(self.sp_index, BlockIndex):
  202. return 'block'
  203. elif isinstance(self.sp_index, IntIndex):
  204. return 'integer'
  205. def as_sparse_array(self, kind=None, fill_value=None, copy=False):
  206. """ return my self as a sparse array, do not copy by default """
  207. if fill_value is None:
  208. fill_value = self.fill_value
  209. if kind is None:
  210. kind = self.kind
  211. return SparseArray(self.values,
  212. sparse_index=self.sp_index,
  213. fill_value=fill_value,
  214. kind=kind,
  215. copy=copy)
  216. def __len__(self):
  217. return len(self.block)
  218. def __unicode__(self):
  219. # currently, unicode is same as repr...fixes infinite loop
  220. series_rep = Series.__unicode__(self)
  221. rep = '%s\n%s' % (series_rep, repr(self.sp_index))
  222. return rep
  223. def __array_wrap__(self, result):
  224. """
  225. Gets called prior to a ufunc (and after)
  226. """
  227. return self._constructor(result,
  228. index=self.index,
  229. sparse_index=self.sp_index,
  230. fill_value=self.fill_value,
  231. copy=False).__finalize__(self)
  232. def __array_finalize__(self, obj):
  233. """
  234. Gets called after any ufunc or other array operations, necessary
  235. to pass on the index.
  236. """
  237. self.name = getattr(obj, 'name', None)
  238. self.fill_value = getattr(obj, 'fill_value', None)
  239. def __getstate__(self):
  240. # pickling
  241. return dict(_typ=self._typ,
  242. _subtyp=self._subtyp,
  243. _data=self._data,
  244. fill_value=self.fill_value,
  245. name=self.name)
  246. def _unpickle_series_compat(self, state):
  247. nd_state, own_state = state
  248. # recreate the ndarray
  249. data = np.empty(nd_state[1], dtype=nd_state[2])
  250. np.ndarray.__setstate__(data, nd_state)
  251. index, fill_value, sp_index = own_state[:3]
  252. name = None
  253. if len(own_state) > 3:
  254. name = own_state[3]
  255. # create a sparse array
  256. if not isinstance(data, SparseArray):
  257. data = SparseArray(
  258. data, sparse_index=sp_index, fill_value=fill_value, copy=False)
  259. # recreate
  260. data = SingleBlockManager(data, index, fastpath=True)
  261. generic.NDFrame.__init__(self, data)
  262. self._set_axis(0, index)
  263. self.name = name
  264. def __iter__(self):
  265. """ forward to the array """
  266. return iter(self.values)
  267. def _set_subtyp(self, is_all_dates):
  268. if is_all_dates:
  269. object.__setattr__(self, '_subtyp', 'sparse_time_series')
  270. else:
  271. object.__setattr__(self, '_subtyp', 'sparse_series')
  272. def _get_val_at(self, loc):
  273. """ forward to the array """
  274. return self.block.values._get_val_at(loc)
  275. def __getitem__(self, key):
  276. """
  277. """
  278. try:
  279. return self._get_val_at(self.index.get_loc(key))
  280. except KeyError:
  281. if isinstance(key, (int, np.integer)):
  282. return self._get_val_at(key)
  283. raise Exception('Requested index not in this series!')
  284. except TypeError:
  285. # Could not hash item, must be array-like?
  286. pass
  287. # is there a case where this would NOT be an ndarray?
  288. # need to find an example, I took out the case for now
  289. key = _values_from_object(key)
  290. dataSlice = self.values[key]
  291. new_index = Index(self.index.view(ndarray)[key])
  292. return self._constructor(dataSlice, index=new_index).__finalize__(self)
  293. def _set_with_engine(self, key, value):
  294. return self.set_value(key, value)
  295. def abs(self):
  296. """
  297. Return an object with absolute value taken. Only applicable to objects
  298. that are all numeric
  299. Returns
  300. -------
  301. abs: type of caller
  302. """
  303. res_sp_values = np.abs(self.sp_values)
  304. return self._constructor(res_sp_values, index=self.index,
  305. sparse_index=self.sp_index,
  306. fill_value=self.fill_value)
  307. def get(self, label, default=None):
  308. """
  309. Returns value occupying requested label, default to specified
  310. missing value if not present. Analogous to dict.get
  311. Parameters
  312. ----------
  313. label : object
  314. Label value looking for
  315. default : object, optional
  316. Value to return if label not in index
  317. Returns
  318. -------
  319. y : scalar
  320. """
  321. if label in self.index:
  322. loc = self.index.get_loc(label)
  323. return self._get_val_at(loc)
  324. else:
  325. return default
  326. def get_value(self, label, takeable=False):
  327. """
  328. Retrieve single value at passed index label
  329. Parameters
  330. ----------
  331. index : label
  332. takeable : interpret the index as indexers, default False
  333. Returns
  334. -------
  335. value : scalar value
  336. """
  337. loc = label if takeable is True else self.index.get_loc(label)
  338. return self._get_val_at(loc)
  339. def set_value(self, label, value, takeable=False):
  340. """
  341. Quickly set single value at passed label. If label is not contained, a
  342. new object is created with the label placed at the end of the result
  343. index
  344. Parameters
  345. ----------
  346. label : object
  347. Partial indexing with MultiIndex not allowed
  348. value : object
  349. Scalar value
  350. takeable : interpret the index as indexers, default False
  351. Notes
  352. -----
  353. This method *always* returns a new object. It is not particularly
  354. efficient but is provided for API compatibility with Series
  355. Returns
  356. -------
  357. series : SparseSeries
  358. """
  359. values = self.to_dense()
  360. # if the label doesn't exist, we will create a new object here
  361. # and possibily change the index
  362. new_values = values.set_value(label, value, takeable=takeable)
  363. if new_values is not None:
  364. values = new_values
  365. new_index = values.index
  366. values = SparseArray(
  367. values, fill_value=self.fill_value, kind=self.kind)
  368. self._data = SingleBlockManager(values, new_index)
  369. self._index = new_index
  370. def _set_values(self, key, value):
  371. # this might be inefficient as we have to recreate the sparse array
  372. # rather than setting individual elements, but have to convert
  373. # the passed slice/boolean that's in dense space into a sparse indexer
  374. # not sure how to do that!
  375. if isinstance(key, Series):
  376. key = key.values
  377. values = self.values.to_dense()
  378. values[key] = _index.convert_scalar(values, value)
  379. values = SparseArray(
  380. values, fill_value=self.fill_value, kind=self.kind)
  381. self._data = SingleBlockManager(values, self.index)
  382. def to_dense(self, sparse_only=False):
  383. """
  384. Convert SparseSeries to (dense) Series
  385. """
  386. if sparse_only:
  387. int_index = self.sp_index.to_int_index()
  388. index = self.index.take(int_index.indices)
  389. return Series(self.sp_values, index=index, name=self.name)
  390. else:
  391. return Series(self.values.to_dense(), index=self.index, name=self.name)
  392. @property
  393. def density(self):
  394. r = float(self.sp_index.npoints) / float(self.sp_index.length)
  395. return r
  396. def copy(self, deep=True):
  397. """
  398. Make a copy of the SparseSeries. Only the actual sparse values need to
  399. be copied
  400. """
  401. new_data = self._data
  402. if deep:
  403. new_data = self._data.copy()
  404. return self._constructor(new_data,
  405. sparse_index=self.sp_index,
  406. fill_value=self.fill_value).__finalize__(self)
  407. def reindex(self, index=None, method=None, copy=True, limit=None):
  408. """
  409. Conform SparseSeries to new Index
  410. See Series.reindex docstring for general behavior
  411. Returns
  412. -------
  413. reindexed : SparseSeries
  414. """
  415. new_index = _ensure_index(index)
  416. if self.index.equals(new_index):
  417. if copy:
  418. return self.copy()
  419. else:
  420. return self
  421. return self._constructor(self._data.reindex(new_index, method=method, limit=limit, copy=copy),
  422. index=new_index).__finalize__(self)
  423. def sparse_reindex(self, new_index):
  424. """
  425. Conform sparse values to new SparseIndex
  426. Parameters
  427. ----------
  428. new_index : {BlockIndex, IntIndex}
  429. Returns
  430. -------
  431. reindexed : SparseSeries
  432. """
  433. if not isinstance(new_index, splib.SparseIndex):
  434. raise TypeError('new index must be a SparseIndex')
  435. block = self.block.sparse_reindex(new_index)
  436. new_data = SingleBlockManager(block, self.index)
  437. return self._constructor(new_data, index=self.index,
  438. sparse_index=new_index,
  439. fill_value=self.fill_value).__finalize__(self)
  440. def take(self, indices, axis=0, convert=True):
  441. """
  442. Sparse-compatible version of ndarray.take
  443. Returns
  444. -------
  445. taken : ndarray
  446. """
  447. new_values = SparseArray.take(self.values, indices)
  448. new_index = self.index.take(indices)
  449. return self._constructor(new_values, index=new_index).__finalize__(self)
  450. def cumsum(self, axis=0, dtype=None, out=None):
  451. """
  452. Cumulative sum of values. Preserves locations of NaN values
  453. Returns
  454. -------
  455. cumsum : Series or SparseSeries
  456. """
  457. new_array = SparseArray.cumsum(self.values)
  458. if isinstance(new_array, SparseArray):
  459. return self._constructor(new_array, index=self.index, sparse_index=new_array.sp_index).__finalize__(self)
  460. return Series(new_array, index=self.index).__finalize__(self)
  461. def dropna(self, axis=0, inplace=False, **kwargs):
  462. """
  463. Analogous to Series.dropna. If fill_value=NaN, returns a dense Series
  464. """
  465. # TODO: make more efficient
  466. axis = self._get_axis_number(axis or 0)
  467. dense_valid = self.to_dense().valid()
  468. if inplace:
  469. raise NotImplementedError("Cannot perform inplace dropna"
  470. " operations on a SparseSeries")
  471. if isnull(self.fill_value):
  472. return dense_valid
  473. else:
  474. dense_valid = dense_valid[dense_valid != self.fill_value]
  475. return dense_valid.to_sparse(fill_value=self.fill_value)
  476. def shift(self, periods, freq=None, **kwds):
  477. """
  478. Analogous to Series.shift
  479. """
  480. from pandas.core.datetools import _resolve_offset
  481. offset = _resolve_offset(freq, kwds)
  482. # no special handling of fill values yet
  483. if not isnull(self.fill_value):
  484. dense_shifted = self.to_dense().shift(periods, freq=freq,
  485. **kwds)
  486. return dense_shifted.to_sparse(fill_value=self.fill_value,
  487. kind=self.kind)
  488. if periods == 0:
  489. return self.copy()
  490. if offset is not None:
  491. return self._constructor(self.sp_values,
  492. sparse_index=self.sp_index,
  493. index=self.index.shift(periods, offset),
  494. fill_value=self.fill_value).__finalize__(self)
  495. int_index = self.sp_index.to_int_index()
  496. new_indices = int_index.indices + periods
  497. start, end = new_indices.searchsorted([0, int_index.length])
  498. new_indices = new_indices[start:end]
  499. new_sp_index = IntIndex(len(self), new_indices)
  500. if isinstance(self.sp_index, BlockIndex):
  501. new_sp_index = new_sp_index.to_block_index()
  502. return self._constructor(self.sp_values[start:end].copy(),
  503. index=self.index,
  504. sparse_index=new_sp_index,
  505. fill_value=self.fill_value).__finalize__(self)
  506. def combine_first(self, other):
  507. """
  508. Combine Series values, choosing the calling Series's values
  509. first. Result index will be the union of the two indexes
  510. Parameters
  511. ----------
  512. other : Series
  513. Returns
  514. -------
  515. y : Series
  516. """
  517. if isinstance(other, SparseSeries):
  518. other = other.to_dense()
  519. dense_combined = self.to_dense().combine_first(other)
  520. return dense_combined.to_sparse(fill_value=self.fill_value)
# overwrite series methods with unaccelerated versions
# (use_numexpr=False for both the special and the flex methods)
ops.add_special_arithmetic_methods(SparseSeries, use_numexpr=False,
                                   **ops.series_special_funcs)
ops.add_flex_arithmetic_methods(SparseSeries, use_numexpr=False,
                                **ops.series_flex_funcs)
# overwrite basic arithmetic to use SparseSeries version
# force methods to overwrite previous definitions.
# NOTE: this call must stay after the two registrations above, since
# force=True is what replaces the methods they installed.
ops.add_special_arithmetic_methods(SparseSeries, _arith_method,
                                   radd_func=operator.add, comp_method=None,
                                   bool_method=None, use_numexpr=False, force=True)

# backwards compatibility: SparseTimeSeries is kept as an alias of SparseSeries
SparseTimeSeries = SparseSeries