PageRenderTime 48ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/sparse/frame.py

http://github.com/wesm/pandas
Python | 838 lines | 669 code | 85 blank | 84 comment | 87 complexity | b509f7cbab74990abffd634250bc106e MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. """
  2. Data structures for sparse float data. Life is made simpler by dealing only
  3. with float64 data
  4. """
  5. from __future__ import division
  6. # pylint: disable=E1101,E1103,W0231,E0202
  7. from numpy import nan
  8. from pandas.compat import lmap
  9. from pandas import compat
  10. import numpy as np
  11. from pandas.types.missing import isnull, notnull
  12. from pandas.types.common import _ensure_platform_int
  13. from pandas.core.common import _try_sort
  14. from pandas.compat.numpy import function as nv
  15. from pandas.core.index import Index, MultiIndex, _ensure_index
  16. from pandas.core.series import Series
  17. from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray,
  18. _default_index)
  19. import pandas.core.algorithms as algos
  20. from pandas.core.internals import (BlockManager,
  21. create_block_manager_from_arrays)
  22. from pandas.core.generic import NDFrame
  23. from pandas.sparse.series import SparseSeries, SparseArray
  24. from pandas.util.decorators import Appender
  25. import pandas.core.ops as ops
class SparseDataFrame(DataFrame):
    """
    DataFrame containing sparse floating point data in the form of SparseSeries
    objects

    Parameters
    ----------
    data : same types as can be passed to DataFrame
    index : array-like, optional
    columns : array-like, optional
    default_kind : {'block', 'integer'}, default 'block'
        Default sparse kind for converting Series to SparseSeries. Will not
        override SparseSeries passed into constructor
    default_fill_value : float
        Default fill_value for converting Series to SparseSeries. Will not
        override SparseSeries passed in
    """
    # Selecting a single column returns a SparseSeries, not a dense Series.
    _constructor_sliced = SparseSeries
    # Tag used by pandas internals to recognize this type of frame.
    _subtyp = 'sparse_frame'
    def __init__(self, data=None, index=None, columns=None, default_kind=None,
                 default_fill_value=None, dtype=None, copy=False):
        """Build a SparseDataFrame from a dict, ndarray/list, (Sparse)Series,
        (Sparse)DataFrame, BlockManager, or ``None`` (empty frame).

        ``default_kind`` / ``default_fill_value`` left as ``None`` are
        inherited from sparse ``data`` when available, otherwise they fall
        back to ``'block'`` / ``NaN``.
        """
        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            if columns is None and hasattr(data, 'name'):
                columns = [data.name]
            if columns is None:
                # NOTE(review): a bare Exception is raised here; a TypeError
                # would be more precise — left unchanged to preserve behavior
                raise Exception("cannot pass a series w/o a name or columns")
            # wrap the single series/array as a one-column dict
            data = {columns[0]: data}

        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = 'block'

        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        # dispatch on the type of ``data`` to construct the BlockManager
        if isinstance(data, dict):
            mgr = self._init_dict(data, index, columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif isinstance(data, DataFrame):
            # dense frame: sparsify it column by column
            mgr = self._init_dict(data, data.index, data.columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif data is None:
            # empty frame: each requested column becomes an all-NaN
            # SparseArray over ``index``
            data = DataFrame()

            if index is None:
                index = Index([])
            else:
                index = _ensure_index(index)

            if columns is None:
                columns = Index([])
            else:
                for c in columns:
                    data[c] = SparseArray(np.nan, index=index,
                                          kind=self._default_kind,
                                          fill_value=self._default_fill_value)
            mgr = to_manager(data, columns, index)

            if dtype is not None:
                mgr = mgr.astype(dtype)

        NDFrame.__init__(self, mgr)
  108. @property
  109. def _constructor(self):
  110. return SparseDataFrame
  111. _constructor_sliced = SparseSeries
    def _init_dict(self, data, index, columns, dtype=None):
        """Construct a BlockManager from a dict of {column -> array-like}.

        Every value is aligned to ``index`` and converted to a SparseArray
        using the frame's default kind / fill_value; columns requested but
        absent from ``data`` are filled with all-NaN sparse arrays.
        """
        # pre-filter out columns if we passed it
        if columns is not None:
            columns = _ensure_index(columns)
            data = dict((k, v) for k, v in compat.iteritems(data)
                        if k in columns)
        else:
            columns = Index(_try_sort(list(data.keys())))

        if index is None:
            index = extract_index(list(data.values()))

        # converter: 1-d values -> SparseArray with the frame's defaults
        sp_maker = lambda x: SparseArray(x, kind=self._default_kind,
                                         fill_value=self._default_fill_value,
                                         copy=True)
        sdict = DataFrame()
        for k, v in compat.iteritems(data):
            if isinstance(v, Series):
                # Force alignment, no copy necessary
                if not v.index.equals(index):
                    v = v.reindex(index)

                if not isinstance(v, SparseSeries):
                    v = sp_maker(v.values)
            elif isinstance(v, SparseArray):
                v = v.copy()
            else:
                if isinstance(v, dict):
                    # labels missing from the dict become NaN
                    v = [v.get(i, nan) for i in index]

                v = sp_maker(v)
            sdict[k] = v

        # TODO: figure out how to handle this case, all nan's?
        # add in any other columns we want to have (completeness)
        nan_vec = np.empty(len(index))
        nan_vec.fill(nan)
        for c in columns:
            if c not in sdict:
                sdict[c] = sp_maker(nan_vec)

        return to_manager(sdict, columns, index)
  148. def _init_matrix(self, data, index, columns, dtype=None):
  149. data = _prep_ndarray(data, copy=False)
  150. N, K = data.shape
  151. if index is None:
  152. index = _default_index(N)
  153. if columns is None:
  154. columns = _default_index(K)
  155. if len(columns) != K:
  156. raise ValueError('Column length mismatch: %d vs. %d' %
  157. (len(columns), K))
  158. if len(index) != N:
  159. raise ValueError('Index length mismatch: %d vs. %d' %
  160. (len(index), N))
  161. data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
  162. return self._init_dict(data, index, columns, dtype)
  163. def __array_wrap__(self, result):
  164. return self._constructor(
  165. result, index=self.index, columns=self.columns,
  166. default_kind=self._default_kind,
  167. default_fill_value=self._default_fill_value).__finalize__(self)
  168. def __getstate__(self):
  169. # pickling
  170. return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
  171. _default_fill_value=self._default_fill_value,
  172. _default_kind=self._default_kind)
    def _unpickle_sparse_frame_compat(self, state):
        """ original pickle format

        Rebuild the frame's internal state from the legacy pickle tuple
        ``(series, cols, idx, fill_value, kind)``.
        """
        series, cols, idx, fv, kind = state

        if not isinstance(cols, Index):  # pragma: no cover
            # legacy pickles stored raw arrays rather than Index objects
            from pandas.io.pickle import _unpickle_array
            columns = _unpickle_array(cols)
        else:
            columns = cols

        if not isinstance(idx, Index):  # pragma: no cover
            from pandas.io.pickle import _unpickle_array
            index = _unpickle_array(idx)
        else:
            index = idx

        # reconstruct each column from its (sparse_index, values) pair
        series_dict = DataFrame()
        for col, (sp_index, sp_values) in compat.iteritems(series):
            series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index,
                                            fill_value=fv)
        self._data = to_manager(series_dict, columns, index)
        self._default_fill_value = fv
        self._default_kind = kind
  193. def to_dense(self):
  194. """
  195. Convert to dense DataFrame
  196. Returns
  197. -------
  198. df : DataFrame
  199. """
  200. data = dict((k, v.to_dense()) for k, v in compat.iteritems(self))
  201. return DataFrame(data, index=self.index, columns=self.columns)
  202. def _apply_columns(self, func):
  203. """ get new SparseDataFrame applying func to each columns """
  204. new_data = {}
  205. for col, series in compat.iteritems(self):
  206. new_data[col] = func(series)
  207. return self._constructor(
  208. data=new_data, index=self.index, columns=self.columns,
  209. default_fill_value=self.default_fill_value).__finalize__(self)
  210. def astype(self, dtype):
  211. return self._apply_columns(lambda x: x.astype(dtype))
  212. def copy(self, deep=True):
  213. """
  214. Make a copy of this SparseDataFrame
  215. """
  216. result = super(SparseDataFrame, self).copy(deep=deep)
  217. result._default_fill_value = self._default_fill_value
  218. result._default_kind = self._default_kind
  219. return result
  220. @property
  221. def default_fill_value(self):
  222. return self._default_fill_value
  223. @property
  224. def default_kind(self):
  225. return self._default_kind
  226. @property
  227. def density(self):
  228. """
  229. Ratio of non-sparse points to total (dense) data points
  230. represented in the frame
  231. """
  232. tot_nonsparse = sum([ser.sp_index.npoints
  233. for _, ser in compat.iteritems(self)])
  234. tot = len(self.index) * len(self.columns)
  235. return tot_nonsparse / float(tot)
    def fillna(self, value=None, method=None, axis=0, inplace=False,
               limit=None, downcast=None):
        """Fill missing values, delegating to the generic implementation and
        then updating the frame's default fill value when a plain scalar
        fill was requested."""
        new_self = super(SparseDataFrame,
                         self).fillna(value=value, method=method, axis=axis,
                                      inplace=inplace, limit=limit,
                                      downcast=downcast)
        if not inplace:
            # rebind so the fill-value update below targets the new object
            self = new_self

        # set the fill value if we are filling as a scalar with nothing special
        # going on
        # (``value == value`` is False only for NaN, excluding it here)
        if (value is not None and value == value and method is None and
                limit is None):
            self._default_fill_value = value

        if not inplace:
            return self
    # ----------------------------------------------------------------------
    # Support different internal representation of SparseDataFrame

    def _sanitize_column(self, key, value):
        """Coerce ``value`` into a SparseArray aligned with ``self.index``.

        Used on column assignment; always returns a SparseArray built with
        the frame's default kind / fill_value.
        """
        # converter: 1-d values (or a scalar plus index) -> SparseArray
        sp_maker = lambda x, index=None: SparseArray(
            x, index=index, fill_value=self._default_fill_value,
            kind=self._default_kind)
        if isinstance(value, SparseSeries):
            clean = value.reindex(self.index).as_sparse_array(
                fill_value=self._default_fill_value, kind=self._default_kind)

        elif isinstance(value, SparseArray):
            if len(value) != len(self.index):
                raise AssertionError('Length of values does not match '
                                     'length of index')
            clean = value

        elif hasattr(value, '__iter__'):
            if isinstance(value, Series):
                # align to our index first, then sparsify if needed
                clean = value.reindex(self.index)
                if not isinstance(value, SparseSeries):
                    clean = sp_maker(clean)
            else:
                if len(value) != len(self.index):
                    raise AssertionError('Length of values does not match '
                                         'length of index')
                clean = sp_maker(value)

        # Scalar
        else:
            # broadcast the scalar over the whole index
            clean = sp_maker(value, self.index)

        # always return a SparseArray!
        return clean
  280. def __getitem__(self, key):
  281. """
  282. Retrieve column or slice from DataFrame
  283. """
  284. if isinstance(key, slice):
  285. date_rng = self.index[key]
  286. return self.reindex(date_rng)
  287. elif isinstance(key, (np.ndarray, list, Series)):
  288. return self._getitem_array(key)
  289. else:
  290. return self._get_item_cache(key)
  291. @Appender(DataFrame.get_value.__doc__, indents=0)
  292. def get_value(self, index, col, takeable=False):
  293. if takeable is True:
  294. series = self._iget_item_cache(col)
  295. else:
  296. series = self._get_item_cache(col)
  297. return series.get_value(index, takeable=takeable)
  298. def set_value(self, index, col, value, takeable=False):
  299. """
  300. Put single value at passed column and index
  301. Parameters
  302. ----------
  303. index : row label
  304. col : column label
  305. value : scalar value
  306. takeable : interpret the index/col as indexers, default False
  307. Notes
  308. -----
  309. This method *always* returns a new object. It is currently not
  310. particularly efficient (and potentially very expensive) but is provided
  311. for API compatibility with DataFrame
  312. Returns
  313. -------
  314. frame : DataFrame
  315. """
  316. dense = self.to_dense().set_value(index, col, value, takeable=takeable)
  317. return dense.to_sparse(kind=self._default_kind,
  318. fill_value=self._default_fill_value)
  319. def _slice(self, slobj, axis=0, kind=None):
  320. if axis == 0:
  321. new_index = self.index[slobj]
  322. new_columns = self.columns
  323. else:
  324. new_index = self.index
  325. new_columns = self.columns[slobj]
  326. return self.reindex(index=new_index, columns=new_columns)
  327. def xs(self, key, axis=0, copy=False):
  328. """
  329. Returns a row (cross-section) from the SparseDataFrame as a Series
  330. object.
  331. Parameters
  332. ----------
  333. key : some index contained in the index
  334. Returns
  335. -------
  336. xs : Series
  337. """
  338. if axis == 1:
  339. data = self[key]
  340. return data
  341. i = self.index.get_loc(key)
  342. data = self.take([i]).get_values()[0]
  343. return Series(data, index=self.columns)
    # ----------------------------------------------------------------------
    # Arithmetic-related methods

    def _combine_frame(self, other, func, fill_value=None, level=None):
        """Element-wise combine with another frame over the union of labels.

        ``func`` is applied column by column; the result's default fill
        value is negotiated from the two frames' fill values below.
        """
        this, other = self.align(other, join='outer', level=level, copy=False)
        new_index, new_columns = this.index, this.columns

        if level is not None:
            raise NotImplementedError("'level' argument is not supported")

        if self.empty and other.empty:
            return self._constructor(index=new_index).__finalize__(self)

        new_data = {}
        new_fill_value = None
        if fill_value is not None:
            # TODO: be a bit more intelligent here
            for col in new_columns:
                if col in this and col in other:
                    # operate densely so ``fill_value`` can stand in for
                    # missing entries, then re-sparsify the result
                    dleft = this[col].to_dense()
                    dright = other[col].to_dense()
                    result = dleft._binop(dright, func, fill_value=fill_value)
                    result = result.to_sparse(fill_value=this[col].fill_value)
                    new_data[col] = result
        else:
            # columns present on only one side are dropped from the result
            for col in new_columns:
                if col in this and col in other:
                    new_data[col] = func(this[col], other[col])

        # if the fill values are the same use them? or use a valid one
        other_fill_value = getattr(other, 'default_fill_value', np.nan)
        if self.default_fill_value == other_fill_value:
            new_fill_value = self.default_fill_value
        elif np.isnan(self.default_fill_value) and not np.isnan(
                other_fill_value):
            new_fill_value = other_fill_value
        elif not np.isnan(self.default_fill_value) and np.isnan(
                other_fill_value):
            new_fill_value = self.default_fill_value

        return self._constructor(data=new_data, index=new_index,
                                 columns=new_columns,
                                 default_fill_value=new_fill_value
                                 ).__finalize__(self)
    def _combine_match_index(self, other, func, level=None, fill_value=None):
        """Combine with a Series aligned along the row index (axis=0)."""
        new_data = {}

        if fill_value is not None:
            raise NotImplementedError("'fill_value' argument is not supported")
        if level is not None:
            raise NotImplementedError("'level' argument is not supported")

        # align both operands to the union of row labels
        new_index = self.index.union(other.index)
        this = self
        if self.index is not new_index:
            this = self.reindex(new_index)

        if other.index is not new_index:
            other = other.reindex(new_index)

        for col, series in compat.iteritems(this):
            new_data[col] = func(series.values, other.values)

        # fill_value is a function of our operator
        if isnull(other.fill_value) or isnull(self.default_fill_value):
            fill_value = np.nan
        else:
            # apply the same operator to the two fill values
            fill_value = func(np.float64(self.default_fill_value),
                              np.float64(other.fill_value))

        return self._constructor(
            new_data, index=new_index, columns=self.columns,
            default_fill_value=fill_value).__finalize__(self)
    def _combine_match_columns(self, other, func, level=None, fill_value=None):
        """Combine with a Series aligned along the columns (axis=1)."""
        # patched version of DataFrame._combine_match_columns to account for
        # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series,
        # where 3.0 is numpy.float64 and series is a SparseSeries. Still
        # possible for this to happen, which is bothersome

        if fill_value is not None:
            raise NotImplementedError("'fill_value' argument is not supported")
        if level is not None:
            raise NotImplementedError("'level' argument is not supported")

        new_data = {}

        union = intersection = self.columns

        if not union.equals(other.index):
            union = other.index.union(self.columns)
            intersection = other.index.intersection(self.columns)

        for col in intersection:
            # float() guards against the NumPy __rsub__ issue noted above
            new_data[col] = func(self[col], float(other[col]))

        return self._constructor(
            new_data, index=self.index, columns=union,
            default_fill_value=self.default_fill_value).__finalize__(self)
  424. def _combine_const(self, other, func):
  425. return self._apply_columns(lambda x: func(x, other))
    def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                       limit=None, takeable=False):
        """Conform the frame to a new row ``index``; rows absent from the
        current index are filled with ``fill_value``."""
        if level is not None:
            raise TypeError('Reindex by level not supported for sparse')

        if self.index.equals(index):
            if copy:
                return self.copy()
            else:
                return self

        if len(self.index) == 0:
            return self._constructor(
                index=index, columns=self.columns).__finalize__(self)

        indexer = self.index.get_indexer(index, method, limit=limit)
        indexer = _ensure_platform_int(indexer)
        # -1 marks requested labels that do not exist in the current index
        mask = indexer == -1
        need_mask = mask.any()

        new_series = {}
        for col, series in self.iteritems():
            if mask.all():
                # NOTE(review): when no requested label matches, every column
                # is skipped, so the result has no data columns at all
                continue

            values = series.values
            # .take returns SparseArray
            new = values.take(indexer)

            if need_mask:
                new = new.values
                # overwrite positions that had no source label
                np.putmask(new, mask, fill_value)

            new_series[col] = new

        return self._constructor(
            new_series, index=index, columns=self.columns,
            default_fill_value=self._default_fill_value).__finalize__(self)
  456. def _reindex_columns(self, columns, copy, level, fill_value, limit=None,
  457. takeable=False):
  458. if level is not None:
  459. raise TypeError('Reindex by level not supported for sparse')
  460. if notnull(fill_value):
  461. raise NotImplementedError("'fill_value' argument is not supported")
  462. if limit:
  463. raise NotImplementedError("'limit' argument is not supported")
  464. # TODO: fill value handling
  465. sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns)
  466. return self._constructor(
  467. sdict, index=self.index, columns=columns,
  468. default_fill_value=self._default_fill_value).__finalize__(self)
    def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
                               limit=None, copy=False, allow_dups=False):
        """Low-level reindex driven by precomputed
        ``{axis: (labels, indexer)}`` pairs."""
        if method is not None or limit is not None:
            raise NotImplementedError("cannot reindex with a method or limit "
                                      "with sparse")

        if fill_value is None:
            fill_value = np.nan

        index, row_indexer = reindexers.get(0, (None, None))
        columns, col_indexer = reindexers.get(1, (None, None))

        if columns is None:
            columns = self.columns

        new_arrays = {}
        for col in columns:
            if col not in self:
                # requested column that we don't have: left out of the data
                continue
            if row_indexer is not None:
                # realign row values densely, filling holes with fill_value
                new_arrays[col] = algos.take_1d(self[col].get_values(),
                                                row_indexer,
                                                fill_value=fill_value)
            else:
                new_arrays[col] = self[col]

        return self._constructor(new_arrays, index=index,
                                 columns=columns).__finalize__(self)
  492. def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
  493. sort=False):
  494. if on is not None:
  495. raise NotImplementedError("'on' keyword parameter is not yet "
  496. "implemented")
  497. return self._join_index(other, how, lsuffix, rsuffix)
  498. def _join_index(self, other, how, lsuffix, rsuffix):
  499. if isinstance(other, Series):
  500. if other.name is None:
  501. raise ValueError('Other Series must have a name')
  502. other = SparseDataFrame(
  503. {other.name: other},
  504. default_fill_value=self._default_fill_value)
  505. join_index = self.index.join(other.index, how=how)
  506. this = self.reindex(join_index)
  507. other = other.reindex(join_index)
  508. this, other = this._maybe_rename_join(other, lsuffix, rsuffix)
  509. from pandas import concat
  510. return concat([this, other], axis=1, verify_integrity=True)
  511. def _maybe_rename_join(self, other, lsuffix, rsuffix):
  512. to_rename = self.columns.intersection(other.columns)
  513. if len(to_rename) > 0:
  514. if not lsuffix and not rsuffix:
  515. raise ValueError('columns overlap but no suffix specified: %s'
  516. % to_rename)
  517. def lrenamer(x):
  518. if x in to_rename:
  519. return '%s%s' % (x, lsuffix)
  520. return x
  521. def rrenamer(x):
  522. if x in to_rename:
  523. return '%s%s' % (x, rsuffix)
  524. return x
  525. this = self.rename(columns=lrenamer)
  526. other = other.rename(columns=rrenamer)
  527. else:
  528. this = self
  529. return this, other
  530. def transpose(self, *args, **kwargs):
  531. """
  532. Returns a DataFrame with the rows/columns switched.
  533. """
  534. nv.validate_transpose(args, kwargs)
  535. return self._constructor(
  536. self.values.T, index=self.columns, columns=self.index,
  537. default_fill_value=self._default_fill_value,
  538. default_kind=self._default_kind).__finalize__(self)
  539. T = property(transpose)
  540. @Appender(DataFrame.count.__doc__)
  541. def count(self, axis=0, **kwds):
  542. if axis is None:
  543. axis = self._stat_axis_number
  544. return self.apply(lambda x: x.count(), axis=axis)
  545. def cumsum(self, axis=0, *args, **kwargs):
  546. """
  547. Return SparseDataFrame of cumulative sums over requested axis.
  548. Parameters
  549. ----------
  550. axis : {0, 1}
  551. 0 for row-wise, 1 for column-wise
  552. Returns
  553. -------
  554. y : SparseDataFrame
  555. """
  556. nv.validate_cumsum(args, kwargs)
  557. if axis is None:
  558. axis = self._stat_axis_number
  559. return self.apply(lambda x: x.cumsum(), axis=axis)
    def apply(self, func, axis=0, broadcast=False, reduce=False):
        """
        Analogous to DataFrame.apply, for SparseDataFrame

        Parameters
        ----------
        func : function
            Function to apply to each column
        axis : {0, 1, 'index', 'columns'}
        broadcast : bool, default False
            For aggregation functions, return object of same size with values
            propagated

        Returns
        -------
        applied : Series or SparseDataFrame
        """
        if not len(self.columns):
            # nothing to apply to
            return self
        axis = self._get_axis_number(axis)

        if isinstance(func, np.ufunc):
            # fast path for ufuncs: apply to each column's sparse values and
            # to its fill value, so sparsity structure is preserved
            new_series = {}
            for k, v in compat.iteritems(self):
                applied = func(v)
                applied.fill_value = func(v.fill_value)
                new_series[k] = applied
            return self._constructor(
                new_series, index=self.index, columns=self.columns,
                default_fill_value=self._default_fill_value,
                default_kind=self._default_kind).__finalize__(self)
        else:
            if not broadcast:
                return self._apply_standard(func, axis, reduce=reduce)
            else:
                return self._apply_broadcast(func, axis)
  593. def applymap(self, func):
  594. """
  595. Apply a function to a DataFrame that is intended to operate
  596. elementwise, i.e. like doing map(func, series) for each series in the
  597. DataFrame
  598. Parameters
  599. ----------
  600. func : function
  601. Python function, returns a single value from a single value
  602. Returns
  603. -------
  604. applied : DataFrame
  605. """
  606. return self.apply(lambda x: lmap(func, x))
  607. def to_manager(sdf, columns, index):
  608. """ create and return the block manager from a dataframe of series,
  609. columns, index
  610. """
  611. # from BlockManager perspective
  612. axes = [_ensure_index(columns), _ensure_index(index)]
  613. return create_block_manager_from_arrays(
  614. [sdf[c] for c in columns], columns, axes)
def stack_sparse_frame(frame):
    """
    Only makes sense when fill_value is NaN

    Stacks the stored (non-NaN) values of ``frame`` into a single-column
    DataFrame ('foo') indexed by a (row, column) MultiIndex.
    """
    lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)]
    nobs = sum(lengths)

    # this is pretty fast
    minor_labels = np.repeat(np.arange(len(frame.columns)), lengths)

    inds_to_concat = []
    vals_to_concat = []
    # TODO: Figure out whether this can be reached.
    # I think this currently can't be reached because you can't build a
    # SparseDataFrame with a non-np.NaN fill value (fails earlier).
    for _, series in compat.iteritems(frame):
        if not np.isnan(series.fill_value):
            raise TypeError('This routine assumes NaN fill value')

        # the int index holds the row positions of the stored values
        int_index = series.sp_index.to_int_index()
        inds_to_concat.append(int_index.indices)
        vals_to_concat.append(series.sp_values)

    major_labels = np.concatenate(inds_to_concat)
    stacked_values = np.concatenate(vals_to_concat)
    index = MultiIndex(levels=[frame.index, frame.columns],
                       labels=[major_labels, minor_labels],
                       verify_integrity=False)

    lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
                   columns=['foo'])
    return lp.sortlevel(level=0)
  642. def homogenize(series_dict):
  643. """
  644. Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex
  645. corresponding to the locations where they all have data
  646. Parameters
  647. ----------
  648. series_dict : dict or DataFrame
  649. Notes
  650. -----
  651. Using the dumbest algorithm I could think of. Should put some more thought
  652. into this
  653. Returns
  654. -------
  655. homogenized : dict of SparseSeries
  656. """
  657. index = None
  658. need_reindex = False
  659. for _, series in compat.iteritems(series_dict):
  660. if not np.isnan(series.fill_value):
  661. raise TypeError('this method is only valid with NaN fill values')
  662. if index is None:
  663. index = series.sp_index
  664. elif not series.sp_index.equals(index):
  665. need_reindex = True
  666. index = index.intersect(series.sp_index)
  667. if need_reindex:
  668. output = {}
  669. for name, series in compat.iteritems(series_dict):
  670. if not series.sp_index.equals(index):
  671. series = series.sparse_reindex(index)
  672. output[name] = series
  673. else:
  674. output = series_dict
  675. return output
# use unaccelerated ops for sparse objects
# NOTE(review): numexpr acceleration is explicitly disabled for both the
# flex methods (add/sub/...) and the special methods (__add__/...) —
# presumably the accelerated evaluation path cannot handle sparse
# structures; confirm before re-enabling.
ops.add_flex_arithmetic_methods(SparseDataFrame, use_numexpr=False,
                                **ops.frame_flex_funcs)
ops.add_special_arithmetic_methods(SparseDataFrame, use_numexpr=False,
                                   **ops.frame_special_funcs)