PageRenderTime 59ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/core/internals.py

https://github.com/benracine/pandas
Python | 1200 lines | 1002 code | 82 blank | 116 comment | 72 complexity | 7089d19120ccdf5e0051a64531ce7265 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import itertools
  2. from numpy import nan
  3. import numpy as np
  4. from pandas.core.index import Index, _ensure_index
  5. from pandas.util.decorators import cache_readonly
  6. import pandas.core.common as common
  7. import pandas._tseries as lib
class Block(object):
    """
    Canonical n-dimensional unit of homogeneous dtype contained in a pandas data
    structure

    Index-ignorant; let the container take care of that
    """
    __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim']

    def __init__(self, values, items, ref_items, ndim=2,
                 do_integrity_check=False):
        # store string data as object arrays so cells can be reassigned
        # without fixed-width dtype trouble
        if issubclass(values.dtype.type, basestring):
            values = np.array(values, dtype=object)

        assert(values.ndim == ndim)
        assert(len(items) == len(values))

        self.values = values
        self.ndim = ndim
        self.items = _ensure_index(items)
        self.ref_items = _ensure_index(ref_items)

        if do_integrity_check:
            self._check_integrity()

    def _check_integrity(self):
        # fewer than two items: nothing to verify
        if len(self.items) < 2:
            return
        # monotonicity
        # NOTE(review): this boolean is returned but the caller (__init__)
        # ignores it, so a monotonicity violation is never reported
        return (self.ref_locs[1:] > self.ref_locs[:-1]).all()

    _ref_locs = None

    @property
    def ref_locs(self):
        # lazily computed integer positions of self.items within
        # self.ref_items; every item must be present in ref_items
        if self._ref_locs is None:
            indexer = self.ref_items.get_indexer(self.items)
            assert((indexer != -1).all())
            self._ref_locs = indexer
        return self._ref_locs

    def set_ref_items(self, ref_items, maybe_rename=True):
        """
        If maybe_rename=True, need to set the items for this guy
        """
        assert(isinstance(ref_items, Index))
        if maybe_rename:
            # relabel using the cached positions into the new reference index
            self.items = ref_items.take(self.ref_locs)
        self.ref_items = ref_items

    def __repr__(self):
        shape = ' x '.join([str(s) for s in self.shape])
        name = type(self).__name__
        return '%s: %s, %s, dtype %s' % (name, self.items, shape, self.dtype)

    def __contains__(self, item):
        return item in self.items

    def __len__(self):
        return len(self.values)

    def __getstate__(self):
        # should not pickle generally (want to share ref_items), but here for
        # completeness
        return (self.items, self.ref_items, self.values)

    def __setstate__(self, state):
        items, ref_items, values = state
        self.items = _ensure_index(items)
        self.ref_items = _ensure_index(ref_items)
        self.values = values
        self.ndim = values.ndim

    @property
    def shape(self):
        return self.values.shape

    @property
    def dtype(self):
        return self.values.dtype

    def copy(self, deep=True):
        """Copy this block; the values ndarray is copied only when deep=True."""
        values = self.values
        if deep:
            values = values.copy()
        return make_block(values, self.items, self.ref_items)

    def merge(self, other):
        """Combine with another block sharing the same ref_items."""
        assert(self.ref_items.equals(other.ref_items))

        # Not sure whether to allow this or not
        # if not union_ref.equals(other.ref_items):
        #     union_ref = self.ref_items + other.ref_items
        return _merge_blocks([self, other], self.ref_items)

    def reindex_axis(self, indexer, mask, needs_masking, axis=0):
        """
        Reindex using pre-computed indexer information
        """
        if self.values.size > 0:
            new_values = common.take_fast(self.values, indexer, mask,
                                          needs_masking, axis=axis)
        else:
            # degenerate (no data): build an all-NaN shell of the right shape
            shape = list(self.shape)
            shape[axis] = len(indexer)
            new_values = np.empty(shape)
            new_values.fill(np.nan)
        return make_block(new_values, self.items, self.ref_items)

    def reindex_items_from(self, new_ref_items):
        """
        Reindex to only those items contained in the input set of items

        E.g. if you have ['a', 'b'], and the input items is ['b', 'c', 'd'],
        then the resulting items will be ['b']

        Returns
        -------
        reindexed : Block
        """
        new_ref_items, indexer = self.items.reindex(new_ref_items)
        # keep only items that survived the reindex (-1 marks missing)
        mask = indexer != -1
        masked_idx = indexer[mask]

        new_values = self.values.take(masked_idx, axis=0)
        new_items = self.items.take(masked_idx)
        return make_block(new_values, new_items, new_ref_items)

    def get(self, item):
        """Return the row of values labeled ``item``."""
        loc = self.items.get_loc(item)
        return self.values[loc]

    def set(self, item, value):
        """
        Modify Block in-place with new item value

        Returns
        -------
        None
        """
        loc = self.items.get_loc(item)
        self.values[loc] = value

    def delete(self, item):
        """
        Remove the row labeled ``item``.

        Returns
        -------
        y : Block (new object)
        """
        loc = self.items.get_loc(item)
        new_items = self.items.delete(loc)
        new_values = np.delete(self.values, loc, 0)
        return make_block(new_values, new_items, self.ref_items)

    def fillna(self, value):
        """Return a copy of this block with nulls replaced by ``value``."""
        new_values = self.values.copy()
        mask = common.isnull(new_values.ravel())
        # .flat lines up with the raveled mask regardless of ndim
        new_values.flat[mask] = value
        return make_block(new_values, self.items, self.ref_items)
  138. #-------------------------------------------------------------------------------
  139. # Is this even possible?
  140. class FloatBlock(Block):
  141. def should_store(self, value):
  142. # when inserting a column should not coerce integers to floats
  143. # unnecessarily
  144. return issubclass(value.dtype.type, np.floating)
  145. class IntBlock(Block):
  146. def should_store(self, value):
  147. return issubclass(value.dtype.type, np.integer)
  148. class BoolBlock(Block):
  149. def should_store(self, value):
  150. return issubclass(value.dtype.type, np.bool_)
  151. class ObjectBlock(Block):
  152. def should_store(self, value):
  153. return not issubclass(value.dtype.type,
  154. (np.integer, np.floating, np.bool_))
  155. def make_block(values, items, ref_items, do_integrity_check=False):
  156. dtype = values.dtype
  157. vtype = dtype.type
  158. if issubclass(vtype, np.floating):
  159. klass = FloatBlock
  160. elif issubclass(vtype, np.integer):
  161. if vtype != np.int64:
  162. values = values.astype('i8')
  163. klass = IntBlock
  164. elif dtype == np.bool_:
  165. klass = BoolBlock
  166. else:
  167. klass = ObjectBlock
  168. return klass(values, items, ref_items, ndim=values.ndim,
  169. do_integrity_check=do_integrity_check)
  170. # TODO: flexible with index=None and/or items=None
class BlockManager(object):
    """
    Core internal data structure to implement DataFrame

    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
    lightweight blocked set of labeled data to be manipulated by the DataFrame
    public API class

    Parameters
    ----------
    blocks : list of Block
        each block's ndim must match len(axes)
    axes : list of Index
        axes[0] is the items axis
    do_integrity_check : bool, default True

    Notes
    -----
    This is *not* a public API class
    """
    # NOTE(review): 'ndim' is listed in __slots__ but shadowed by the
    # property below, so that slot is never actually used
    __slots__ = ['axes', 'blocks', 'ndim']

    def __init__(self, blocks, axes, do_integrity_check=True):
        self.axes = [_ensure_index(ax) for ax in axes]
        self.blocks = blocks

        ndim = len(axes)
        for block in blocks:
            assert(ndim == block.values.ndim)

        if do_integrity_check:
            self._verify_integrity()

    @property
    def ndim(self):
        # dimensionality is implied by the number of axes
        return len(self.axes)

    def is_mixed_dtype(self):
        """True if the blocks hold more than one distinct dtype."""
        counts = set()
        for block in self.blocks:
            counts.add(block.dtype)
            if len(counts) > 1:
                return True
        return False

    def set_axis(self, axis, value):
        """Replace an axis's labels in-place; length must be unchanged."""
        cur_axis = self.axes[axis]
        if len(value) != len(cur_axis):
            raise Exception('Length mismatch (%d vs %d)'
                            % (len(value), len(cur_axis)))
        self.axes[axis] = _ensure_index(value)

        if axis == 0:
            # renaming items: every block must be relabeled against the new
            # reference index
            for block in self.blocks:
                block.set_ref_items(self.items, maybe_rename=True)

    # make items read only for now
    def _get_items(self):
        return self.axes[0]
    items = property(fget=_get_items)

    def set_items_norename(self, value):
        # swap in a new items index without relabeling the blocks (their
        # labels are assumed to already correspond)
        value = _ensure_index(value)
        self.axes[0] = value

        for block in self.blocks:
            block.set_ref_items(value, maybe_rename=False)

    def __getstate__(self):
        block_values = [b.values for b in self.blocks]
        block_items = [b.items for b in self.blocks]
        axes_array = [ax for ax in self.axes]
        return axes_array, block_values, block_items

    def __setstate__(self, state):
        # discard anything after 3rd, support beta pickling format for a little
        # while longer
        ax_arrays, bvalues, bitems = state[:3]

        self.axes = [_ensure_index(ax) for ax in ax_arrays]
        blocks = []
        for values, items in zip(bvalues, bitems):
            blk = make_block(values, items, self.axes[0],
                             do_integrity_check=True)
            blocks.append(blk)
        self.blocks = blocks

    def __len__(self):
        return len(self.items)

    def __repr__(self):
        output = 'BlockManager'
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += '\nItems: %s' % ax
            else:
                output += '\nAxis %d: %s' % (i, ax)

        for block in self.blocks:
            output += '\n%s' % repr(block)
        return output

    @property
    def shape(self):
        return tuple(len(ax) for ax in self.axes)

    def _verify_integrity(self):
        # _union_block_items raises if an item appears in multiple blocks
        _union_block_items(self.blocks)
        mgr_shape = self.shape
        for block in self.blocks:
            assert(block.values.shape[1:] == mgr_shape[1:])
        tot_items = sum(len(x.items) for x in self.blocks)
        assert(len(self.items) == tot_items)

    def astype(self, dtype):
        """Return a new, consolidated manager with all blocks cast to dtype."""
        new_blocks = []
        for block in self.blocks:
            newb = make_block(block.values.astype(dtype), block.items,
                              block.ref_items)
            new_blocks.append(newb)

        new_mgr = BlockManager(new_blocks, self.axes)
        return new_mgr.consolidate()

    def is_consolidated(self):
        """
        Return True if no dtype appears in more than one block
        """
        dtypes = [blk.dtype for blk in self.blocks]
        return len(dtypes) == len(set(dtypes))

    def get_slice(self, slobj, axis=0):
        """Slice along ``axis``; multi-block axis-0 slices go via reindex."""
        new_axes = list(self.axes)
        new_axes[axis] = new_axes[axis][slobj]

        if axis == 0:
            new_items = new_axes[0]
            if len(self.blocks) == 1:
                blk = self.blocks[0]
                newb = make_block(blk.values[slobj], new_items,
                                  new_items)
                new_blocks = [newb]
            else:
                return self.reindex_items(new_items)
        else:
            new_blocks = self._slice_blocks(slobj, axis)

        return BlockManager(new_blocks, new_axes, do_integrity_check=False)

    def _slice_blocks(self, slobj, axis):
        # apply slobj along `axis` to every block
        new_blocks = []

        slicer = [slice(None, None) for _ in range(self.ndim)]
        slicer[axis] = slobj
        slicer = tuple(slicer)

        for block in self.blocks:
            newb = make_block(block.values[slicer], block.items,
                              block.ref_items)
            new_blocks.append(newb)

        return new_blocks

    def get_series_dict(self):
        # For DataFrame
        return _blocks_to_series_dict(self.blocks, self.axes[1])

    @classmethod
    def from_blocks(cls, blocks, index):
        # also checks for overlap
        items = _union_block_items(blocks)
        return BlockManager(blocks, [items, index])

    def __contains__(self, item):
        return item in self.items

    @property
    def nblocks(self):
        return len(self.blocks)

    def copy(self, deep=True):
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : boolean, default True
            If False, return shallow copy (do not copy data)

        Returns
        -------
        copy : BlockManager
        """
        copy_blocks = [block.copy(deep=deep) for block in self.blocks]
        return BlockManager(copy_blocks, self.axes)

    def as_matrix(self, items=None):
        """Convert the blocks to a single ndarray, interleaving if needed."""
        if len(self.blocks) == 0:
            mat = np.empty(self.shape, dtype=float)
        elif len(self.blocks) == 1:
            blk = self.blocks[0]
            if items is None or blk.items.equals(items):
                # if not, then just call interleave per below
                mat = blk.values
            else:
                mat = self.reindex_items(items).as_matrix()
        else:
            if items is None:
                mat = self._interleave(self.items)
            else:
                mat = self.reindex_items(items).as_matrix()

        return mat

    def _interleave(self, items):
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        dtype = _interleaved_dtype(self.blocks)
        items = _ensure_index(items)

        result = np.empty(self.shape, dtype=dtype)
        itemmask = np.zeros(len(items), dtype=bool)

        # By construction, all of the item should be covered by one of the
        # blocks
        for block in self.blocks:
            indexer = items.get_indexer(block.items)
            assert((indexer != -1).all())
            result[indexer] = block.values
            itemmask[indexer] = 1

        assert(itemmask.all())
        return result

    def xs(self, key, axis=1, copy=True):
        """Cross-section at label ``key`` along a non-items axis."""
        assert(axis >= 1)

        loc = self.axes[axis].get_loc(key)
        slicer = [slice(None, None) for _ in range(self.ndim)]
        slicer[axis] = loc
        slicer = tuple(slicer)

        new_axes = list(self.axes)

        # could be an array indexer!
        if isinstance(loc, (slice, np.ndarray)):
            new_axes[axis] = new_axes[axis][loc]
        else:
            # scalar location: the axis disappears from the result
            new_axes.pop(axis)

        new_blocks = []
        if len(self.blocks) > 1:
            if not copy:
                raise Exception('cannot get view of mixed-type or '
                                'non-consolidated DataFrame')
            for blk in self.blocks:
                newb = make_block(blk.values[slicer], blk.items, blk.ref_items)
                new_blocks.append(newb)
        elif len(self.blocks) == 1:
            vals = self.blocks[0].values[slicer]
            if copy:
                vals = vals.copy()
            new_blocks = [make_block(vals, self.items, self.items)]

        return BlockManager(new_blocks, new_axes)

    def fast_2d_xs(self, loc, copy=False):
        """
        Fast cross-section at integer position ``loc`` on axis 1; single-block
        managers can return a view, mixed managers must interleave.
        """
        if len(self.blocks) == 1:
            result = self.blocks[0].values[:, loc]
            if copy:
                result = result.copy()
            return result

        if not copy:
            raise Exception('cannot get view of mixed-type or '
                            'non-consolidated DataFrame')

        dtype = _interleaved_dtype(self.blocks)

        items = self.items
        n = len(items)
        result = np.empty(n, dtype=dtype)
        for blk in self.blocks:
            values = blk.values
            for j, item in enumerate(blk.items):
                i = items.get_loc(item)
                result[i] = values[j, loc]

        return result

    def consolidate(self):
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        new_blocks = _consolidate(self.blocks, self.items)
        return BlockManager(new_blocks, self.axes)

    def get(self, item):
        """Return the values row for ``item``."""
        _, block = self._find_block(item)
        return block.get(item)

    def get_scalar(self, tup):
        """
        Retrieve single item
        """
        item = tup[0]
        _, blk = self._find_block(item)

        # this could obviously be seriously sped up in cython
        # trailing comma makes a 1-tuple to concatenate with the other axes
        item_loc = blk.items.get_loc(item),
        full_loc = item_loc + tuple(ax.get_loc(x)
                                    for ax, x in zip(self.axes[1:], tup[1:]))
        return blk.values[full_loc]

    def delete(self, item):
        """Remove ``item`` in-place, dropping its block if it becomes empty."""
        i, _ = self._find_block(item)
        loc = self.items.get_loc(item)

        new_items = Index(np.delete(np.asarray(self.items), loc))

        self._delete_from_block(i, item)
        self.set_items_norename(new_items)

    def set(self, item, value):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        """
        if value.ndim == self.ndim - 1:
            value = value.reshape((1,) + value.shape)
        assert(value.shape[1:] == self.shape[1:])
        if item in self.items:
            i, block = self._find_block(item)
            if not block.should_store(value):
                # delete from block, create and append new block
                self._delete_from_block(i, item)
                self._add_new_block(item, value)
            else:
                block.set(item, value)
        else:
            # insert at end
            self.insert(len(self.items), item, value)

    def insert(self, loc, item, value):
        """Insert a new item at position ``loc``; raises if it exists."""
        if item in self.items:
            raise Exception('cannot insert %s, already exists' % item)

        new_items = self.items.insert(loc, item)
        self.set_items_norename(new_items)

        # new block
        self._add_new_block(item, value)

    def _delete_from_block(self, i, item):
        """
        Delete and maybe remove the whole block
        """
        block = self.blocks[i]
        newb = block.delete(item)

        if len(newb.ref_locs) == 0:
            self.blocks.pop(i)
        else:
            self.blocks[i] = newb

    def _add_new_block(self, item, value):
        # Do we care about dtype at the moment?

        # hm, elaborate hack?
        loc = self.items.get_loc(item)
        new_block = make_block(value, self.items[loc:loc+1], self.items)
        self.blocks.append(new_block)

    def _find_block(self, item):
        # returns (block position, block); raises KeyError via _check_have
        # when item is unknown
        self._check_have(item)
        for i, block in enumerate(self.blocks):
            if item in block:
                return i, block

    def _check_have(self, item):
        if item not in self.items:
            raise KeyError('no item named %s' % str(item))

    def reindex_axis(self, new_axis, method=None, axis=0):
        """Conform to a new axis; axis 0 delegates to reindex_items."""
        if axis == 0:
            assert(method is None)
            return self.reindex_items(new_axis)

        new_axis = _ensure_index(new_axis)
        cur_axis = self.axes[axis]
        new_axis, indexer = cur_axis.reindex(new_axis, method)

        mask = indexer == -1

        # TODO: deal with length-0 case? or does it fall out?
        needs_masking = len(new_axis) > 0 and mask.any()

        new_blocks = []
        for block in self.blocks:
            newb = block.reindex_axis(indexer, mask, needs_masking,
                                      axis=axis)
            new_blocks.append(newb)

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        return BlockManager(new_blocks, new_axes)

    def reindex_indexer(self, new_axis, indexer, axis=1):
        """
        pandas-indexer with -1's only
        """
        if axis == 0:
            raise NotImplementedError

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        new_blocks = []
        for blk in self.blocks:
            new_values = common.take_fast(blk.values, indexer, None,
                                          False, axis=axis)
            newb = make_block(new_values, blk.items, self.items)
            new_blocks.append(newb)

        return BlockManager(new_blocks, new_axes)

    def reindex_items(self, new_items):
        """
        Conform to a new items index, adding an all-NaN float block for any
        items not previously present.
        """
        new_items = _ensure_index(new_items)
        data = self
        if not data.is_consolidated():
            data = data.consolidate()
            return data.reindex_items(new_items)

        # TODO: this part could be faster (!)
        new_items, indexer = self.items.reindex(new_items)
        mask = indexer == -1

        new_blocks = []
        for block in self.blocks:
            newb = block.reindex_items_from(new_items)
            if len(newb.items) > 0:
                new_blocks.append(newb)

        if mask.any():
            extra_items = new_items[mask]

            block_shape = list(self.shape)
            block_shape[0] = len(extra_items)
            block_values = np.empty(block_shape, dtype=np.float64)
            block_values.fill(nan)
            na_block = make_block(block_values, extra_items, new_items,
                                  do_integrity_check=True)
            new_blocks.append(na_block)
            new_blocks = _consolidate(new_blocks, new_items)

        new_axes = list(self.axes)
        new_axes[0] = new_items
        return BlockManager(new_blocks, new_axes)

    def take(self, indexer, axis=1):
        """Take positions along a non-items axis."""
        if axis == 0:
            raise NotImplementedError

        indexer = np.asarray(indexer, dtype='i4')

        n = len(self.axes[axis])
        if ((indexer == -1) | (indexer >= n)).any():
            raise Exception('Indices must be nonzero and less than '
                            'the axis length')

        new_axes = list(self.axes)
        new_axes[axis] = self.axes[axis].take(indexer)
        new_blocks = []
        for blk in self.blocks:
            new_values = common.take_fast(blk.values, indexer,
                                          None, False, axis=axis)
            newb = make_block(new_values, blk.items, self.items)
            new_blocks.append(newb)

        return BlockManager(new_blocks, new_axes)

    def merge(self, other, lsuffix=None, rsuffix=None):
        """Merge with a manager indexed alike on all non-items axes."""
        assert(self._is_indexed_like(other))

        this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

        cons_items = this.items + other.items
        consolidated = _consolidate(this.blocks + other.blocks, cons_items)

        new_axes = list(this.axes)
        new_axes[0] = cons_items

        return BlockManager(consolidated, new_axes)

    def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
        # attach suffixes to overlapping item names; error when there is an
        # overlap but no suffix was supplied
        intersection = self.items.intersection(other.items)

        if len(intersection) > 0:
            if not lsuffix and not rsuffix:
                raise Exception('columns overlap: %s' % intersection)

            def lrenamer(x):
                if x in intersection:
                    return '%s%s' % (x, lsuffix)
                return x

            def rrenamer(x):
                if x in intersection:
                    return '%s%s' % (x, rsuffix)
                return x

            # XXX: COPIES DATA!
            this = self.rename_items(lrenamer, copydata=copydata)
            other = other.rename_items(rrenamer, copydata=copydata)
        else:
            this = self

        return this, other

    def _is_indexed_like(self, other):
        """
        Check all axes except items
        """
        assert(self.ndim == other.ndim)
        for ax, oax in zip(self.axes[1:], other.axes[1:]):
            if not ax.equals(oax):
                return False
        return True

    def join_on(self, other, on, how='left', axis=1, lsuffix=None,
                rsuffix=None):
        """Join ``other``'s blocks onto this manager, aligned on ``on``."""
        this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

        other_axis = other.axes[axis]
        indexer = other_axis.get_indexer(on)

        if how == 'left':
            mask = indexer == -1
            needs_masking = len(on) > 0 and mask.any()
        else:
            # non-left join: keep only matched positions on both sides
            mask = indexer != -1
            this = this.take(mask.nonzero()[0], axis=axis)
            indexer = indexer[mask]
            mask = None
            needs_masking = False

        other_blocks = []
        for block in other.blocks:
            newb = block.reindex_axis(indexer, mask, needs_masking, axis=axis)
            other_blocks.append(newb)

        cons_items = this.items + other.items
        consolidated = _consolidate(this.blocks + other_blocks, cons_items)

        new_axes = list(this.axes)
        new_axes[0] = cons_items

        return BlockManager(consolidated, new_axes)

    def rename_axis(self, mapper, axis=1):
        """Return a manager with mapper applied to the given axis's labels."""
        new_axis = Index([mapper(x) for x in self.axes[axis]])
        new_axis._verify_integrity()

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        return BlockManager(self.blocks, new_axes)

    def rename_items(self, mapper, copydata=True):
        """Return a manager with mapper applied to the item labels."""
        new_items = Index([mapper(x) for x in self.items])
        new_items._verify_integrity()

        new_blocks = []
        for block in self.blocks:
            newb = block.copy(deep=copydata)
            newb.set_ref_items(new_items, maybe_rename=True)
            new_blocks.append(newb)
        new_axes = list(self.axes)
        new_axes[0] = new_items
        return BlockManager(new_blocks, new_axes)

    def add_prefix(self, prefix):
        # builds '<prefix>%s' and applies it to each item via __mod__
        f = (('%s' % prefix) + '%s').__mod__
        return self.rename_items(f)

    def add_suffix(self, suffix):
        # builds '%s<suffix>' and applies it to each item via __mod__
        f = ('%s' + ('%s' % suffix)).__mod__
        return self.rename_items(f)

    def fillna(self, value):
        """Return a new manager with nulls in every block replaced by value."""
        new_blocks = [b.fillna(value) for b in self.blocks]
        return BlockManager(new_blocks, self.axes)

    @property
    def block_id_vector(self):
        # TODO
        # maps each item position to the index of the block holding it
        result = np.empty(len(self.items), dtype=int)
        result.fill(-1)

        for i, blk in enumerate(self.blocks):
            indexer = self.items.get_indexer(blk.items)
            assert((indexer != -1).all())
            result.put(indexer, i)

        assert((result >= 0).all())
        return result
def form_blocks(data, axes):
    """
    Group the arrays in ``data`` (mapping of item -> ndarray) into
    homogeneous-dtype Blocks conforming to ``axes``; items in axes[0] with
    no data get an all-NaN float block.
    """
    # pre-filter out items if we passed it
    items = axes[0]

    if len(data) < len(items):
        extra_items = items - Index(data.keys())
    else:
        extra_items = []

    # put "leftover" items in float bucket, where else?
    # generalize?
    float_dict = {}
    int_dict = {}
    bool_dict = {}
    object_dict = {}
    for k, v in data.iteritems():
        if issubclass(v.dtype.type, np.floating):
            float_dict[k] = v
        elif issubclass(v.dtype.type, np.integer):
            int_dict[k] = v
        elif v.dtype == np.bool_:
            bool_dict[k] = v
        else:
            object_dict[k] = v

    blocks = []
    if len(float_dict):
        float_block = _simple_blockify(float_dict, items, np.float64)
        blocks.append(float_block)

    if len(int_dict):
        int_block = _simple_blockify(int_dict, items, np.int64)
        blocks.append(int_block)

    if len(bool_dict):
        bool_block = _simple_blockify(bool_dict, items, np.bool_)
        blocks.append(bool_block)

    if len(object_dict) > 0:
        object_block = _simple_blockify(object_dict, items, np.object_)
        blocks.append(object_block)

    if len(extra_items):
        # items without data: one shared all-NaN float block
        shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])

        block_values = np.empty(shape, dtype=float)
        block_values.fill(nan)
        na_block = make_block(block_values, extra_items, items,
                              do_integrity_check=True)
        blocks.append(na_block)
        blocks = _consolidate(blocks, items)

    return blocks
  706. def _simple_blockify(dct, ref_items, dtype):
  707. block_items, values = _stack_dict(dct, ref_items)
  708. # CHECK DTYPE?
  709. if values.dtype != dtype: # pragma: no cover
  710. values = values.astype(dtype)
  711. return make_block(values, block_items, ref_items, do_integrity_check=True)
  712. def _stack_dict(dct, ref_items):
  713. from pandas.core.series import Series
  714. # fml
  715. def _asarray_compat(x):
  716. # asarray shouldn't be called on SparseSeries
  717. if isinstance(x, Series):
  718. return x.values
  719. else:
  720. return np.asarray(x)
  721. items = [x for x in ref_items if x in dct]
  722. stacked = np.vstack([_asarray_compat(dct[k]) for k in items])
  723. return items, stacked
  724. def _blocks_to_series_dict(blocks, index=None):
  725. from pandas.core.series import Series
  726. series_dict = {}
  727. for block in blocks:
  728. for item, vec in zip(block.items, block.values):
  729. series_dict[item] = Series(vec, index=index, name=item)
  730. return series_dict
  731. def _interleaved_dtype(blocks):
  732. from collections import defaultdict
  733. counts = defaultdict(lambda: 0)
  734. for x in blocks:
  735. counts[type(x)] += 1
  736. have_int = counts[IntBlock] > 0
  737. have_bool = counts[BoolBlock] > 0
  738. have_object = counts[ObjectBlock] > 0
  739. have_float = counts[FloatBlock] > 0
  740. have_numeric = have_float or have_int
  741. if have_object:
  742. return np.object_
  743. elif have_bool and have_numeric:
  744. return np.object_
  745. elif have_bool:
  746. return np.bool_
  747. elif have_int and not have_float:
  748. return np.int64
  749. else:
  750. return np.float64
  751. def _consolidate(blocks, items):
  752. """
  753. Merge blocks having same dtype
  754. """
  755. get_dtype = lambda x: x.dtype
  756. # sort by dtype
  757. grouper = itertools.groupby(sorted(blocks, key=get_dtype),
  758. lambda x: x.dtype)
  759. new_blocks = []
  760. for dtype, group_blocks in grouper:
  761. new_block = _merge_blocks(list(group_blocks), items)
  762. new_blocks.append(new_block)
  763. return new_blocks
  764. # TODO: this could be much optimized
  765. def _merge_blocks(blocks, items):
  766. if len(blocks) == 1:
  767. return blocks[0]
  768. new_values = np.vstack([b.values for b in blocks])
  769. new_items = blocks[0].items.append([b.items for b in blocks[1:]])
  770. new_block = make_block(new_values, new_items, items,
  771. do_integrity_check=True)
  772. return new_block.reindex_items_from(items)
  773. def _union_block_items(blocks):
  774. tot_len = 0
  775. all_items = []
  776. slow = False
  777. for b in blocks:
  778. tot_len += len(b.items)
  779. if type(b.items) != Index:
  780. slow = True
  781. all_items.append(b.items)
  782. if slow:
  783. the_union = _union_items_slow(all_items)
  784. else:
  785. the_union = Index(lib.fast_unique_multiple(all_items))
  786. if tot_len > len(the_union):
  787. raise Exception('item names overlap')
  788. return the_union
  789. def _union_items_slow(all_items):
  790. seen = None
  791. for items in all_items:
  792. if seen is None:
  793. seen = items
  794. else:
  795. seen = seen.union(items)
  796. return seen
  797. def join_managers(left, right, axis=1, how='left', copy=True):
  798. op = _JoinOperation(left, right, axis=axis, how=how)
  799. return op.get_result(copy=copy)
class _JoinOperation(object):
    """
    Object responsible for orchestrating efficient join operation between two
    BlockManager data structures
    """
    def __init__(self, left, right, axis=1, how='left'):
        # work on consolidated managers so there is at most one block per
        # dtype on each side
        if not left.is_consolidated():
            left = left.consolidate()
        if not right.is_consolidated():
            right = right.consolidate()

        self.left = left
        self.right = right
        self.axis = axis
        self.how = how

        laxis = left.axes[axis]
        raxis = right.axes[axis]

        (self.join_index,
         self.lindexer,
         self.rindexer) = laxis.join(raxis, how=how, return_indexers=True)

        # do NOT sort
        self.result_items = left.items.append(right.items)
        self.result_axes = list(left.axes)
        self.result_axes[0] = self.result_items
        self.result_axes[axis] = self.join_index

    def get_result(self, copy=False):
        """
        Parameters
        ----------
        other
        lindexer
        lmask
        rindexer
        rmask

        Returns
        -------
        merged : BlockManager
        """
        left_blockmap, right_blockmap = self._prepare_blocks()

        result_blocks = []

        # maybe want to enable flexible copying
        kinds = set(left_blockmap) | set(right_blockmap)
        for klass in kinds:
            lblk = left_blockmap.get(klass)
            rblk = right_blockmap.get(klass)

            # NOTE(review): truthiness goes through Block.__len__, so an
            # empty block here is treated like a missing one
            if lblk and rblk:
                # true merge, do not produce intermediate copy
                res_blk = self._merge_blocks(lblk, rblk)
            elif lblk:
                res_blk = self._reindex_block(lblk, side='left')
            else:
                res_blk = self._reindex_block(rblk, side='right')

            result_blocks.append(res_blk)

        return BlockManager(result_blocks, self.result_axes)

    def _prepare_blocks(self):
        # upcast int/bool blocks where the join introduces missing rows,
        # then map exact block type -> block for each side
        lblocks = self.left.blocks
        rblocks = self.right.blocks

        # will short-circuit and not compute lneed_masking
        if self.lneed_masking:
            lblocks = self._upcast_blocks(lblocks)

        if self.rneed_masking:
            rblocks = self._upcast_blocks(rblocks)

        left_blockmap = dict((type(blk), blk) for blk in lblocks)
        right_blockmap = dict((type(blk), blk) for blk in rblocks)

        return left_blockmap, right_blockmap

    def _reindex_block(self, block, side='left', copy=True):
        # conform one-sided block to the join result along self.axis
        if side == 'left':
            indexer = self.lindexer
            mask, need_masking = self.lmask_info
        else:
            indexer = self.rindexer
            mask, need_masking = self.rmask_info

        # still some inefficiency here for bool/int64 because in the case
        # where no masking is needed, take_fast will recompute the mask

        if indexer is None and copy:
            result = block.copy()
        else:
            result = block.reindex_axis(indexer, mask, need_masking,
                                        axis=self.axis)

        result.ref_items = self.result_items
        return result

    @cache_readonly
    def lmask_info(self):
        # (mask, need_masking) for the left side; only relevant when the
        # join can introduce missing rows into int/bool blocks
        if (self.lindexer is None or
            not self._may_need_upcasting(self.left.blocks)):
            lmask = None
            lneed_masking = False
        else:
            lmask = self.lindexer == -1
            lneed_masking = lmask.any()

        return lmask, lneed_masking

    @cache_readonly
    def rmask_info(self):
        # (mask, need_masking) for the right side, mirroring lmask_info
        if (self.rindexer is None or
            not self._may_need_upcasting(self.right.blocks)):
            rmask = None
            rneed_masking = False
        else:
            rmask = self.rindexer == -1
            rneed_masking = rmask.any()

        return rmask, rneed_masking

    @property
    def lneed_masking(self):
        return self.lmask_info[1]

    @property
    def rneed_masking(self):
        return self.rmask_info[1]

    @staticmethod
    def _may_need_upcasting(blocks):
        # int/bool dtypes cannot represent NaN, so they may need upcasting
        for block in blocks:
            if isinstance(block, (IntBlock, BoolBlock)):
                return True
        return False

    def _merge_blocks(self, lblk, rblk):
        """Merge one same-typed block from each side into a single block."""
        lidx = self.lindexer
        ridx = self.rindexer

        n = lblk.values.shape[self.axis] if lidx is None else len(lidx)
        lk = len(lblk.items)
        rk = len(rblk.items)

        out_shape = list(lblk.shape)
        out_shape[0] = lk + rk
        out_shape[self.axis] = n

        out = np.empty(out_shape, dtype=lblk.values.dtype)

        # is this really faster than assigning to arr.flat?
        if lidx is None:
            # out[:lk] = lblk.values
            common.take_fast(lblk.values, np.arange(n, dtype='i4'),
                             None, False,
                             axis=self.axis, out=out[:lk])
        else:
            # write out the values to the result array
            common.take_fast(lblk.values, lidx, None, False,
                             axis=self.axis, out=out[:lk])

        if ridx is None:
            # out[lk:] = lblk.values
            common.take_fast(rblk.values, np.arange(n, dtype='i4'),
                             None, False,
                             axis=self.axis, out=out[lk:])
        else:
            common.take_fast(rblk.values, ridx, None, False,
                             axis=self.axis, out=out[lk:])

        # does not sort
        new_items = lblk.items.append(rblk.items)
        return make_block(out, new_items, self.result_items)

    @staticmethod
    def _upcast_blocks(blocks):
        """
        Upcast and consolidate if necessary
        """
        # if not need_masking:
        #     return blocks

        new_blocks = []
        for block in blocks:
            if isinstance(block, IntBlock):
                # int -> float so the result can hold NaN
                newb = make_block(block.values.astype(float), block.items,
                                  block.ref_items)
            elif isinstance(block, BoolBlock):
                # bool -> object so the result can hold NaN
                newb = make_block(block.values.astype(object), block.items,
                                  block.ref_items)
            else:
                newb = block
            new_blocks.append(newb)

        # use any ref_items
        # NOTE(review): relies on the loop variable 'newb' surviving the
        # loop; raises NameError if 'blocks' is empty
        return _consolidate(new_blocks, newb.ref_items)