PageRenderTime 51ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/sparse/frame.py

http://github.com/pydata/pandas
Python | 832 lines | 665 code | 85 blank | 82 comment | 88 complexity | 298bab4d3c57082de3a09055447c25b6 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. """
  2. Data structures for sparse float data. Life is made simpler by dealing only
  3. with float64 data
  4. """
  5. from __future__ import division
  6. # pylint: disable=E1101,E1103,W0231,E0202
  7. from numpy import nan
  8. from pandas.compat import range, lmap, map
  9. from pandas import compat
  10. import numpy as np
  11. from pandas.core.common import (isnull, notnull, _pickle_array,
  12. _unpickle_array, _try_sort)
  13. from pandas.core.index import Index, MultiIndex, _ensure_index
  14. from pandas.core.indexing import _maybe_convert_indices
  15. from pandas.core.series import Series
  16. from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray,
  17. _default_index)
  18. from pandas.util.decorators import cache_readonly
  19. import pandas.core.common as com
  20. import pandas.core.datetools as datetools
  21. from pandas.core.internals import BlockManager, create_block_manager_from_arrays
  22. from pandas.core.generic import NDFrame
  23. from pandas.sparse.series import SparseSeries, SparseArray
  24. from pandas.util.decorators import Appender
  25. import pandas.core.ops as ops
class SparseDataFrame(DataFrame):
    """
    DataFrame containing sparse floating point data in the form of SparseSeries
    objects

    Parameters
    ----------
    data : same types as can be passed to DataFrame
    index : array-like, optional
    columns : array-like, optional
    default_kind : {'block', 'integer'}, default 'block'
        Default sparse kind for converting Series to SparseSeries. Will not
        override SparseSeries passed into constructor
    default_fill_value : float
        Default fill_value for converting Series to SparseSeries. Will not
        override SparseSeries passed in
    dtype : optional
        When given, the constructed data is cast to this dtype
    copy : boolean, default False
        Forwarded to the block-manager initialisation when ``data`` is a
        SparseDataFrame or a BlockManager
    """
    # type used for the columns of this frame
    _constructor_sliced = SparseSeries
    # subtype tag; it is written into the pickled state by __getstate__
    _subtyp = 'sparse_frame'
  44. def __init__(self, data=None, index=None, columns=None,
  45. default_kind=None, default_fill_value=None,
  46. dtype=None, copy=False):
  47. # pick up the defaults from the Sparse structures
  48. if isinstance(data, SparseDataFrame):
  49. if index is None:
  50. index = data.index
  51. if columns is None:
  52. columns = data.columns
  53. if default_fill_value is None:
  54. default_fill_value = data.default_fill_value
  55. if default_kind is None:
  56. default_kind = data.default_kind
  57. elif isinstance(data, (SparseSeries, SparseArray)):
  58. if index is None:
  59. index = data.index
  60. if default_fill_value is None:
  61. default_fill_value = data.fill_value
  62. if columns is None and hasattr(data, 'name'):
  63. columns = [data.name]
  64. if columns is None:
  65. raise Exception("cannot pass a series w/o a name or columns")
  66. data = {columns[0]: data}
  67. if default_fill_value is None:
  68. default_fill_value = np.nan
  69. if default_kind is None:
  70. default_kind = 'block'
  71. self._default_kind = default_kind
  72. self._default_fill_value = default_fill_value
  73. if isinstance(data, dict):
  74. mgr = self._init_dict(data, index, columns)
  75. if dtype is not None:
  76. mgr = mgr.astype(dtype)
  77. elif isinstance(data, (np.ndarray, list)):
  78. mgr = self._init_matrix(data, index, columns)
  79. if dtype is not None:
  80. mgr = mgr.astype(dtype)
  81. elif isinstance(data, SparseDataFrame):
  82. mgr = self._init_mgr(
  83. data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy)
  84. elif isinstance(data, DataFrame):
  85. mgr = self._init_dict(data, data.index, data.columns)
  86. if dtype is not None:
  87. mgr = mgr.astype(dtype)
  88. elif isinstance(data, BlockManager):
  89. mgr = self._init_mgr(
  90. data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy)
  91. elif data is None:
  92. data = {}
  93. if index is None:
  94. index = Index([])
  95. else:
  96. index = _ensure_index(index)
  97. if columns is None:
  98. columns = Index([])
  99. else:
  100. for c in columns:
  101. data[c] = SparseArray(np.nan,
  102. index=index,
  103. kind=self._default_kind,
  104. fill_value=self._default_fill_value)
  105. mgr = dict_to_manager(data, columns, index)
  106. if dtype is not None:
  107. mgr = mgr.astype(dtype)
  108. NDFrame.__init__(self, mgr)
  109. @property
  110. def _constructor(self):
  111. def wrapper(data=None, index=None, columns=None, default_fill_value=None, kind=None, fill_value=None, copy=False):
  112. result = SparseDataFrame(data, index=index, columns=columns,
  113. default_fill_value=fill_value,
  114. default_kind=kind,
  115. copy=copy)
  116. # fill if requested
  117. if fill_value is not None and not isnull(fill_value):
  118. result.fillna(fill_value, inplace=True)
  119. # set the default_fill_value
  120. # if default_fill_value is not None:
  121. # result._default_fill_value = default_fill_value
  122. return result
  123. return wrapper
  124. def _init_dict(self, data, index, columns, dtype=None):
  125. # pre-filter out columns if we passed it
  126. if columns is not None:
  127. columns = _ensure_index(columns)
  128. data = dict((k, v) for k, v in compat.iteritems(data) if k in columns)
  129. else:
  130. columns = Index(_try_sort(list(data.keys())))
  131. if index is None:
  132. index = extract_index(list(data.values()))
  133. sp_maker = lambda x: SparseArray(x,
  134. kind=self._default_kind,
  135. fill_value=self._default_fill_value,
  136. copy=True)
  137. sdict = {}
  138. for k, v in compat.iteritems(data):
  139. if isinstance(v, Series):
  140. # Force alignment, no copy necessary
  141. if not v.index.equals(index):
  142. v = v.reindex(index)
  143. if not isinstance(v, SparseSeries):
  144. v = sp_maker(v.values)
  145. elif isinstance(v, SparseArray):
  146. v = sp_maker(v.values)
  147. else:
  148. if isinstance(v, dict):
  149. v = [v.get(i, nan) for i in index]
  150. v = sp_maker(v)
  151. sdict[k] = v
  152. # TODO: figure out how to handle this case, all nan's?
  153. # add in any other columns we want to have (completeness)
  154. nan_vec = np.empty(len(index))
  155. nan_vec.fill(nan)
  156. for c in columns:
  157. if c not in sdict:
  158. sdict[c] = sp_maker(nan_vec)
  159. return dict_to_manager(sdict, columns, index)
  160. def _init_matrix(self, data, index, columns, dtype=None):
  161. data = _prep_ndarray(data, copy=False)
  162. N, K = data.shape
  163. if index is None:
  164. index = _default_index(N)
  165. if columns is None:
  166. columns = _default_index(K)
  167. if len(columns) != K:
  168. raise ValueError('Column length mismatch: %d vs. %d' %
  169. (len(columns), K))
  170. if len(index) != N:
  171. raise ValueError('Index length mismatch: %d vs. %d' %
  172. (len(index), N))
  173. data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
  174. return self._init_dict(data, index, columns, dtype)
  175. def __array_wrap__(self, result):
  176. return SparseDataFrame(result, index=self.index, columns=self.columns,
  177. default_kind=self._default_kind,
  178. default_fill_value=self._default_fill_value).__finalize__(self)
  179. def __getstate__(self):
  180. # pickling
  181. return dict(_typ=self._typ,
  182. _subtyp=self._subtyp,
  183. _data=self._data,
  184. _default_fill_value=self._default_fill_value,
  185. _default_kind=self._default_kind)
  186. def _unpickle_sparse_frame_compat(self, state):
  187. """ original pickle format """
  188. series, cols, idx, fv, kind = state
  189. if not isinstance(cols, Index): # pragma: no cover
  190. columns = _unpickle_array(cols)
  191. else:
  192. columns = cols
  193. if not isinstance(idx, Index): # pragma: no cover
  194. index = _unpickle_array(idx)
  195. else:
  196. index = idx
  197. series_dict = {}
  198. for col, (sp_index, sp_values) in compat.iteritems(series):
  199. series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index,
  200. fill_value=fv)
  201. self._data = dict_to_manager(series_dict, columns, index)
  202. self._default_fill_value = fv
  203. self._default_kind = kind
  204. def to_dense(self):
  205. """
  206. Convert to dense DataFrame
  207. Returns
  208. -------
  209. df : DataFrame
  210. """
  211. data = dict((k, v.to_dense()) for k, v in compat.iteritems(self))
  212. return DataFrame(data, index=self.index)
  213. def astype(self, dtype):
  214. raise NotImplementedError
  215. def copy(self, deep=True):
  216. """
  217. Make a copy of this SparseDataFrame
  218. """
  219. result = super(SparseDataFrame, self).copy(deep=deep)
  220. result._default_fill_value = self._default_fill_value
  221. result._default_kind = self._default_kind
  222. return result
  223. @property
  224. def default_fill_value(self):
  225. return self._default_fill_value
  226. @property
  227. def default_kind(self):
  228. return self._default_kind
  229. @property
  230. def density(self):
  231. """
  232. Ratio of non-sparse points to total (dense) data points
  233. represented in the frame
  234. """
  235. tot_nonsparse = sum([ser.sp_index.npoints
  236. for _, ser in compat.iteritems(self)])
  237. tot = len(self.index) * len(self.columns)
  238. return tot_nonsparse / float(tot)
  239. def fillna(self, value=None, method=None, axis=0, inplace=False,
  240. limit=None, downcast=None):
  241. new_self = super(
  242. SparseDataFrame, self).fillna(value=value, method=method, axis=axis,
  243. inplace=inplace, limit=limit, downcast=downcast)
  244. if not inplace:
  245. self = new_self
  246. # set the fill value if we are filling as a scalar with nothing special
  247. # going on
  248. if value is not None and value == value and method is None and limit is None:
  249. self._default_fill_value = value
  250. if not inplace:
  251. return self
  252. #----------------------------------------------------------------------
  253. # Support different internal representation of SparseDataFrame
  254. def _sanitize_column(self, key, value):
  255. sp_maker = lambda x, index=None: SparseArray(x,
  256. index=index,
  257. fill_value=self._default_fill_value,
  258. kind=self._default_kind)
  259. if isinstance(value, SparseSeries):
  260. clean = value.reindex(
  261. self.index).as_sparse_array(fill_value=self._default_fill_value,
  262. kind=self._default_kind)
  263. elif isinstance(value, SparseArray):
  264. if len(value) != len(self.index):
  265. raise AssertionError('Length of values does not match '
  266. 'length of index')
  267. clean = value
  268. elif hasattr(value, '__iter__'):
  269. if isinstance(value, Series):
  270. clean = value.reindex(self.index)
  271. if not isinstance(value, SparseSeries):
  272. clean = sp_maker(clean)
  273. else:
  274. if len(value) != len(self.index):
  275. raise AssertionError('Length of values does not match '
  276. 'length of index')
  277. clean = sp_maker(value)
  278. # Scalar
  279. else:
  280. clean = sp_maker(value, self.index)
  281. # always return a SparseArray!
  282. return clean
  283. def __getitem__(self, key):
  284. """
  285. Retrieve column or slice from DataFrame
  286. """
  287. if isinstance(key, slice):
  288. date_rng = self.index[key]
  289. return self.reindex(date_rng)
  290. elif isinstance(key, (np.ndarray, list, Series)):
  291. return self._getitem_array(key)
  292. else:
  293. return self._get_item_cache(key)
  294. @Appender(DataFrame.get_value.__doc__, indents=0)
  295. def get_value(self, index, col, takeable=False):
  296. if takeable is True:
  297. series = self._iget_item_cache(col)
  298. else:
  299. series = self._get_item_cache(col)
  300. return series.get_value(index, takeable=takeable)
  301. def set_value(self, index, col, value, takeable=False):
  302. """
  303. Put single value at passed column and index
  304. Parameters
  305. ----------
  306. index : row label
  307. col : column label
  308. value : scalar value
  309. takeable : interpret the index/col as indexers, default False
  310. Notes
  311. -----
  312. This method *always* returns a new object. It is currently not
  313. particularly efficient (and potentially very expensive) but is provided
  314. for API compatibility with DataFrame
  315. Returns
  316. -------
  317. frame : DataFrame
  318. """
  319. dense = self.to_dense().set_value(index, col, value, takeable=takeable)
  320. return dense.to_sparse(kind=self._default_kind,
  321. fill_value=self._default_fill_value)
  322. def _slice(self, slobj, axis=0, typ=None):
  323. if axis == 0:
  324. new_index = self.index[slobj]
  325. new_columns = self.columns
  326. else:
  327. new_index = self.index
  328. new_columns = self.columns[slobj]
  329. return self.reindex(index=new_index, columns=new_columns)
  330. def xs(self, key, axis=0, copy=False):
  331. """
  332. Returns a row (cross-section) from the SparseDataFrame as a Series
  333. object.
  334. Parameters
  335. ----------
  336. key : some index contained in the index
  337. Returns
  338. -------
  339. xs : Series
  340. """
  341. if axis == 1:
  342. data = self[key]
  343. return data
  344. i = self.index.get_loc(key)
  345. data = self.take([i]).get_values()[0]
  346. return Series(data, index=self.columns)
  347. #----------------------------------------------------------------------
  348. # Arithmetic-related methods
  349. def _combine_frame(self, other, func, fill_value=None, level=None):
  350. this, other = self.align(other, join='outer', level=level,
  351. copy=False)
  352. new_index, new_columns = this.index, this.columns
  353. if level is not None:
  354. raise NotImplementedError
  355. if self.empty and other.empty:
  356. return SparseDataFrame(index=new_index).__finalize__(self)
  357. new_data = {}
  358. new_fill_value = None
  359. if fill_value is not None:
  360. # TODO: be a bit more intelligent here
  361. for col in new_columns:
  362. if col in this and col in other:
  363. dleft = this[col].to_dense()
  364. dright = other[col].to_dense()
  365. result = dleft._binop(dright, func, fill_value=fill_value)
  366. result = result.to_sparse(fill_value=this[col].fill_value)
  367. new_data[col] = result
  368. else:
  369. for col in new_columns:
  370. if col in this and col in other:
  371. new_data[col] = func(this[col], other[col])
  372. # if the fill values are the same use them? or use a valid one
  373. other_fill_value = getattr(other, 'default_fill_value', np.nan)
  374. if self.default_fill_value == other_fill_value:
  375. new_fill_value = self.default_fill_value
  376. elif np.isnan(self.default_fill_value) and not np.isnan(other_fill_value):
  377. new_fill_value = other_fill_value
  378. elif not np.isnan(self.default_fill_value) and np.isnan(other_fill_value):
  379. new_fill_value = self.default_fill_value
  380. return self._constructor(data=new_data,
  381. index=new_index,
  382. columns=new_columns,
  383. default_fill_value=new_fill_value,
  384. fill_value=new_fill_value).__finalize__(self)
  385. def _combine_match_index(self, other, func, level=None, fill_value=None):
  386. new_data = {}
  387. if fill_value is not None:
  388. raise NotImplementedError
  389. if level is not None:
  390. raise NotImplementedError
  391. new_index = self.index.union(other.index)
  392. this = self
  393. if self.index is not new_index:
  394. this = self.reindex(new_index)
  395. if other.index is not new_index:
  396. other = other.reindex(new_index)
  397. for col, series in compat.iteritems(this):
  398. new_data[col] = func(series.values, other.values)
  399. # fill_value is a function of our operator
  400. if isnull(other.fill_value) or isnull(self.default_fill_value):
  401. fill_value = np.nan
  402. else:
  403. fill_value = func(np.float64(self.default_fill_value),
  404. np.float64(other.fill_value))
  405. return self._constructor(new_data,
  406. index=new_index,
  407. columns=self.columns,
  408. default_fill_value=fill_value,
  409. fill_value=self.default_fill_value).__finalize__(self)
  410. def _combine_match_columns(self, other, func, level=None, fill_value=None):
  411. # patched version of DataFrame._combine_match_columns to account for
  412. # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series,
  413. # where 3.0 is numpy.float64 and series is a SparseSeries. Still
  414. # possible for this to happen, which is bothersome
  415. if fill_value is not None:
  416. raise NotImplementedError
  417. if level is not None:
  418. raise NotImplementedError
  419. new_data = {}
  420. union = intersection = self.columns
  421. if not union.equals(other.index):
  422. union = other.index.union(self.columns)
  423. intersection = other.index.intersection(self.columns)
  424. for col in intersection:
  425. new_data[col] = func(self[col], float(other[col]))
  426. return self._constructor(new_data,
  427. index=self.index,
  428. columns=union,
  429. default_fill_value=self.default_fill_value,
  430. fill_value=self.default_fill_value).__finalize__(self)
  431. def _combine_const(self, other, func):
  432. new_data = {}
  433. for col, series in compat.iteritems(self):
  434. new_data[col] = func(series, other)
  435. return self._constructor(data=new_data,
  436. index=self.index,
  437. columns=self.columns,
  438. default_fill_value=self.default_fill_value,
  439. fill_value=self.default_fill_value).__finalize__(self)
  440. def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
  441. limit=None, takeable=False):
  442. if level is not None:
  443. raise TypeError('Reindex by level not supported for sparse')
  444. if self.index.equals(index):
  445. if copy:
  446. return self.copy()
  447. else:
  448. return self
  449. if len(self.index) == 0:
  450. return SparseDataFrame(index=index, columns=self.columns)
  451. indexer = self.index.get_indexer(index, method, limit=limit)
  452. indexer = com._ensure_platform_int(indexer)
  453. mask = indexer == -1
  454. need_mask = mask.any()
  455. new_series = {}
  456. for col, series in self.iteritems():
  457. if mask.all():
  458. continue
  459. values = series.values
  460. new = values.take(indexer)
  461. if need_mask:
  462. np.putmask(new, mask, fill_value)
  463. new_series[col] = new
  464. return SparseDataFrame(new_series, index=index, columns=self.columns,
  465. default_fill_value=self._default_fill_value)
  466. def _reindex_columns(self, columns, copy, level, fill_value, limit=None,
  467. takeable=False):
  468. if level is not None:
  469. raise TypeError('Reindex by level not supported for sparse')
  470. if com.notnull(fill_value):
  471. raise NotImplementedError
  472. if limit:
  473. raise NotImplementedError
  474. # TODO: fill value handling
  475. sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns)
  476. return SparseDataFrame(sdict, index=self.index, columns=columns,
  477. default_fill_value=self._default_fill_value)
  478. def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, limit=None,
  479. copy=False, allow_dups=False):
  480. if method is not None or limit is not None:
  481. raise NotImplementedError("cannot reindex with a method or limit with sparse")
  482. if fill_value is None:
  483. fill_value = np.nan
  484. index, row_indexer = reindexers.get(0, (None, None))
  485. columns, col_indexer = reindexers.get(1, (None, None))
  486. if columns is None:
  487. columns = self.columns
  488. new_arrays = {}
  489. for col in columns:
  490. if col not in self:
  491. continue
  492. if row_indexer is not None:
  493. new_arrays[col] = com.take_1d(
  494. self[col].get_values(), row_indexer,
  495. fill_value=fill_value)
  496. else:
  497. new_arrays[col] = self[col]
  498. return SparseDataFrame(new_arrays, index=index, columns=columns).__finalize__(self)
  499. def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
  500. sort=False):
  501. if on is not None:
  502. raise NotImplementedError("'on' keyword parameter is not yet "
  503. "implemented")
  504. return self._join_index(other, how, lsuffix, rsuffix)
  505. def _join_index(self, other, how, lsuffix, rsuffix):
  506. if isinstance(other, Series):
  507. if other.name is None:
  508. raise ValueError('Other Series must have a name')
  509. other = SparseDataFrame({other.name: other},
  510. default_fill_value=self._default_fill_value)
  511. join_index = self.index.join(other.index, how=how)
  512. this = self.reindex(join_index)
  513. other = other.reindex(join_index)
  514. this, other = this._maybe_rename_join(other, lsuffix, rsuffix)
  515. from pandas import concat
  516. return concat([this, other], axis=1, verify_integrity=True)
  517. def _maybe_rename_join(self, other, lsuffix, rsuffix):
  518. to_rename = self.columns.intersection(other.columns)
  519. if len(to_rename) > 0:
  520. if not lsuffix and not rsuffix:
  521. raise ValueError('columns overlap but no suffix specified: %s'
  522. % to_rename)
  523. def lrenamer(x):
  524. if x in to_rename:
  525. return '%s%s' % (x, lsuffix)
  526. return x
  527. def rrenamer(x):
  528. if x in to_rename:
  529. return '%s%s' % (x, rsuffix)
  530. return x
  531. this = self.rename(columns=lrenamer)
  532. other = other.rename(columns=rrenamer)
  533. else:
  534. this = self
  535. return this, other
  536. def transpose(self):
  537. """
  538. Returns a DataFrame with the rows/columns switched.
  539. """
  540. return SparseDataFrame(self.values.T, index=self.columns,
  541. columns=self.index,
  542. default_fill_value=self._default_fill_value,
  543. default_kind=self._default_kind).__finalize__(self)
  544. T = property(transpose)
  545. @Appender(DataFrame.count.__doc__)
  546. def count(self, axis=0, **kwds):
  547. return self.apply(lambda x: x.count(), axis=axis)
  548. def cumsum(self, axis=0):
  549. """
  550. Return SparseDataFrame of cumulative sums over requested axis.
  551. Parameters
  552. ----------
  553. axis : {0, 1}
  554. 0 for row-wise, 1 for column-wise
  555. Returns
  556. -------
  557. y : SparseDataFrame
  558. """
  559. return self.apply(lambda x: x.cumsum(), axis=axis)
  560. def apply(self, func, axis=0, broadcast=False, reduce=False):
  561. """
  562. Analogous to DataFrame.apply, for SparseDataFrame
  563. Parameters
  564. ----------
  565. func : function
  566. Function to apply to each column
  567. axis : {0, 1, 'index', 'columns'}
  568. broadcast : bool, default False
  569. For aggregation functions, return object of same size with values
  570. propagated
  571. Returns
  572. -------
  573. applied : Series or SparseDataFrame
  574. """
  575. if not len(self.columns):
  576. return self
  577. axis = self._get_axis_number(axis)
  578. if isinstance(func, np.ufunc):
  579. new_series = {}
  580. for k, v in compat.iteritems(self):
  581. applied = func(v)
  582. applied.fill_value = func(applied.fill_value)
  583. new_series[k] = applied
  584. return self._constructor(new_series, index=self.index,
  585. columns=self.columns,
  586. default_fill_value=self._default_fill_value,
  587. kind=self._default_kind).__finalize__(self)
  588. else:
  589. if not broadcast:
  590. return self._apply_standard(func, axis, reduce=reduce)
  591. else:
  592. return self._apply_broadcast(func, axis)
  593. def applymap(self, func):
  594. """
  595. Apply a function to a DataFrame that is intended to operate
  596. elementwise, i.e. like doing map(func, series) for each series in the
  597. DataFrame
  598. Parameters
  599. ----------
  600. func : function
  601. Python function, returns a single value from a single value
  602. Returns
  603. -------
  604. applied : DataFrame
  605. """
  606. return self.apply(lambda x: lmap(func, x))
  607. def dict_to_manager(sdict, columns, index):
  608. """ create and return the block manager from a dict of series, columns, index """
  609. # from BlockManager perspective
  610. axes = [_ensure_index(columns), _ensure_index(index)]
  611. return create_block_manager_from_arrays([sdict[c] for c in columns], columns, axes)
  612. def stack_sparse_frame(frame):
  613. """
  614. Only makes sense when fill_value is NaN
  615. """
  616. lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)]
  617. nobs = sum(lengths)
  618. # this is pretty fast
  619. minor_labels = np.repeat(np.arange(len(frame.columns)), lengths)
  620. inds_to_concat = []
  621. vals_to_concat = []
  622. # TODO: Figure out whether this can be reached.
  623. # I think this currently can't be reached because you can't build a SparseDataFrame
  624. # with a non-np.NaN fill value (fails earlier).
  625. for _, series in compat.iteritems(frame):
  626. if not np.isnan(series.fill_value):
  627. raise TypeError('This routine assumes NaN fill value')
  628. int_index = series.sp_index.to_int_index()
  629. inds_to_concat.append(int_index.indices)
  630. vals_to_concat.append(series.sp_values)
  631. major_labels = np.concatenate(inds_to_concat)
  632. stacked_values = np.concatenate(vals_to_concat)
  633. index = MultiIndex(levels=[frame.index, frame.columns],
  634. labels=[major_labels, minor_labels],
  635. verify_integrity=False)
  636. lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
  637. columns=['foo'])
  638. return lp.sortlevel(level=0)
  639. def homogenize(series_dict):
  640. """
  641. Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex
  642. corresponding to the locations where they all have data
  643. Parameters
  644. ----------
  645. series_dict : dict or DataFrame
  646. Notes
  647. -----
  648. Using the dumbest algorithm I could think of. Should put some more thought
  649. into this
  650. Returns
  651. -------
  652. homogenized : dict of SparseSeries
  653. """
  654. index = None
  655. need_reindex = False
  656. for _, series in compat.iteritems(series_dict):
  657. if not np.isnan(series.fill_value):
  658. raise TypeError('this method is only valid with NaN fill values')
  659. if index is None:
  660. index = series.sp_index
  661. elif not series.sp_index.equals(index):
  662. need_reindex = True
  663. index = index.intersect(series.sp_index)
  664. if need_reindex:
  665. output = {}
  666. for name, series in compat.iteritems(series_dict):
  667. if not series.sp_index.equals(index):
  668. series = series.sparse_reindex(index)
  669. output[name] = series
  670. else:
  671. output = series_dict
  672. return output
  673. # use unaccelerated ops for sparse objects
  674. ops.add_flex_arithmetic_methods(SparseDataFrame, use_numexpr=False,
  675. **ops.frame_flex_funcs)
  676. ops.add_special_arithmetic_methods(SparseDataFrame, use_numexpr=False,
  677. **ops.frame_special_funcs)