PageRenderTime 58ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/core/internals.py

https://github.com/ara818/pandas
Python | 1170 lines | 1034 code | 26 blank | 110 comment | 11 complexity | 710b8cd662bac1c7c7a672d6d07a2505 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import itertools
  2. from numpy import nan
  3. import numpy as np
  4. from pandas.core.index import Index, _ensure_index
  5. from pandas.util.decorators import cache_readonly
  6. import pandas.core.common as common
  7. import pandas._tseries as lib
class Block(object):
    """
    Canonical n-dimensional unit of homogeneous dtype contained in a pandas data
    structure

    Index-ignorant; let the container take care of that

    Notes
    -----
    ``items`` labels the rows of ``values``; ``ref_items`` is the full item
    index of the owning BlockManager, shared across its blocks.  ``ref_locs``
    maps this block's items into positions within ``ref_items``.
    """
    __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim']

    def __init__(self, values, items, ref_items, ndim=2,
                 do_integrity_check=False):
        # store string data as object arrays (NOTE: Python 2 ``basestring``)
        if issubclass(values.dtype.type, basestring):
            values = np.array(values, dtype=object)

        assert(values.ndim == ndim)
        assert(len(items) == len(values))

        self.values = values
        self.ndim = ndim
        self.items = _ensure_index(items)
        self.ref_items = _ensure_index(ref_items)

        if do_integrity_check:
            self._check_integrity()

    def _check_integrity(self):
        # NOTE(review): this only *computes* monotonicity of ref_locs and
        # returns the result; it never raises, so failing the check is silent
        if len(self.items) < 2:
            return
        # monotonicity
        return (self.ref_locs[1:] > self.ref_locs[:-1]).all()

    # lazily-computed cache for the ref_locs property
    _ref_locs = None

    @property
    def ref_locs(self):
        # positions of this block's items within the shared ref_items index
        if self._ref_locs is None:
            indexer = self.ref_items.get_indexer(self.items)
            assert((indexer != -1).all())
            self._ref_locs = indexer
        return self._ref_locs

    def set_ref_items(self, ref_items, maybe_rename=True):
        """
        If maybe_rename=True, need to set the items for this guy
        """
        assert(isinstance(ref_items, Index))
        if maybe_rename:
            # relabel our items from the new reference index, in place
            self.items = ref_items.take(self.ref_locs)
        self.ref_items = ref_items

    def __repr__(self):
        shape = ' x '.join([str(s) for s in self.shape])
        name = type(self).__name__
        return '%s: %s, %s, dtype %s' % (name, self.items, shape, self.dtype)

    def __contains__(self, item):
        return item in self.items

    def __len__(self):
        return len(self.values)

    def __getstate__(self):
        # should not pickle generally (want to share ref_items), but here for
        # completeness
        return (self.items, self.ref_items, self.values)

    def __setstate__(self, state):
        items, ref_items, values = state
        self.items = _ensure_index(items)
        self.ref_items = _ensure_index(ref_items)
        self.values = values
        self.ndim = values.ndim

    @property
    def shape(self):
        return self.values.shape

    @property
    def dtype(self):
        return self.values.dtype

    def copy(self, deep=True):
        # deep=False shares the underlying ndarray
        values = self.values
        if deep:
            values = values.copy()
        return make_block(values, self.items, self.ref_items)

    def merge(self, other):
        assert(self.ref_items.equals(other.ref_items))

        # Not sure whether to allow this or not
        # if not union_ref.equals(other.ref_items):
        #     union_ref = self.ref_items + other.ref_items
        return _merge_blocks([self, other], self.ref_items)

    def reindex_axis(self, indexer, mask, needs_masking, axis=0):
        """
        Reindex using pre-computed indexer information
        """
        if self.values.size > 0:
            new_values = common.take_fast(self.values, indexer, mask,
                                          needs_masking, axis=axis)
        else:
            # empty values: produce an all-NaN block of the right shape
            shape = list(self.shape)
            shape[axis] = len(indexer)
            new_values = np.empty(shape)
            new_values.fill(np.nan)
        return make_block(new_values, self.items, self.ref_items)

    def reindex_items_from(self, new_ref_items):
        """
        Reindex to only those items contained in the input set of items

        E.g. if you have ['a', 'b'], and the input items is ['b', 'c', 'd'],
        then the resulting items will be ['b']

        Returns
        -------
        reindexed : Block
        """
        new_ref_items, indexer = self.items.reindex(new_ref_items)
        mask = indexer != -1
        masked_idx = indexer[mask]
        new_values = self.values.take(masked_idx, axis=0)
        new_items = self.items.take(masked_idx)
        return make_block(new_values, new_items, new_ref_items)

    def get(self, item):
        # return the row of values for a single item label
        loc = self.items.get_loc(item)
        return self.values[loc]

    def set(self, item, value):
        """
        Modify Block in-place with new item value

        Returns
        -------
        None
        """
        loc = self.items.get_loc(item)
        self.values[loc] = value

    def delete(self, item):
        """
        Remove an item's row, returning a new Block.

        Returns
        -------
        y : Block (new object)
        """
        loc = self.items.get_loc(item)
        new_items = self.items.delete(loc)
        new_values = np.delete(self.values, loc, 0)
        return make_block(new_values, new_items, self.ref_items)

    def fillna(self, value):
        # replace nulls with ``value`` in a copy; null test is on the raveled
        # data, assigned back through .flat
        new_values = self.values.copy()
        mask = common.isnull(new_values.ravel())
        new_values.flat[mask] = value
        return make_block(new_values, self.items, self.ref_items)
  138. #-------------------------------------------------------------------------------
  139. # Is this even possible?
  140. class FloatBlock(Block):
  141. def can_store(self, value):
  142. return issubclass(value.dtype.type, (np.integer, np.floating))
  143. class IntBlock(Block):
  144. def can_store(self, value):
  145. return issubclass(value.dtype.type, np.integer)
  146. class BoolBlock(Block):
  147. def can_store(self, value):
  148. return issubclass(value.dtype.type, np.bool_)
  149. class ObjectBlock(Block):
  150. def can_store(self, value):
  151. return not issubclass(value.dtype.type,
  152. (np.integer, np.floating, np.bool_))
  153. def make_block(values, items, ref_items, do_integrity_check=False):
  154. dtype = values.dtype
  155. vtype = dtype.type
  156. if issubclass(vtype, np.floating):
  157. klass = FloatBlock
  158. elif issubclass(vtype, np.integer):
  159. klass = IntBlock
  160. elif dtype == np.bool_:
  161. klass = BoolBlock
  162. else:
  163. klass = ObjectBlock
  164. return klass(values, items, ref_items, ndim=values.ndim,
  165. do_integrity_check=do_integrity_check)
# TODO: flexible with index=None and/or items=None

class BlockManager(object):
    """
    Core internal data structure to implement DataFrame

    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
    lightweight blocked set of labeled data to be manipulated by the DataFrame
    public API class

    Parameters
    ----------
    blocks : list of Block
    axes : list of Index
        axes[0] is the item (column) axis shared with every block's
        ref_items; remaining axes label the other dimensions.
    do_integrity_check : bool, default True

    Notes
    -----
    This is *not* a public API class
    """
    # NOTE: 'ndim' in __slots__ is shadowed by the property below; nothing
    # ever assigns self.ndim here
    __slots__ = ['axes', 'blocks', 'ndim']

    def __init__(self, blocks, axes, do_integrity_check=True):
        self.axes = [_ensure_index(ax) for ax in axes]
        self.blocks = blocks

        ndim = len(axes)
        for block in blocks:
            assert(ndim == block.values.ndim)

        if do_integrity_check:
            self._verify_integrity()

    @property
    def ndim(self):
        return len(self.axes)

    def is_mixed_dtype(self):
        # True as soon as two distinct block dtypes are seen
        counts = set()
        for block in self.blocks:
            counts.add(block.dtype)
            if len(counts) > 1:
                return True
        return False

    def set_axis(self, axis, value):
        """Replace axis labels in place; length must be unchanged."""
        cur_axis = self.axes[axis]
        if len(value) != len(cur_axis):
            raise Exception('Length mismatch (%d vs %d)'
                            % (len(value), len(cur_axis)))
        self.axes[axis] = _ensure_index(value)

        if axis == 0:
            # item axis changed: propagate new labels into every block
            for block in self.blocks:
                block.set_ref_items(self.items, maybe_rename=True)

    # make items read only for now
    def _get_items(self):
        return self.axes[0]
    items = property(fget=_get_items)

    def set_items_norename(self, value):
        # swap the item index without relabeling block items (labels already
        # match); just repoint each block's ref_items
        value = _ensure_index(value)
        self.axes[0] = value

        for block in self.blocks:
            block.set_ref_items(value, maybe_rename=False)

    def __getstate__(self):
        block_values = [b.values for b in self.blocks]
        block_items = [b.items for b in self.blocks]
        axes_array = [ax for ax in self.axes]
        return axes_array, block_values, block_items

    def __setstate__(self, state):
        # discard anything after 3rd, support beta pickling format for a little
        # while longer
        ax_arrays, bvalues, bitems = state[:3]

        self.axes = [_ensure_index(ax) for ax in ax_arrays]
        blocks = []
        for values, items in zip(bvalues, bitems):
            blk = make_block(values, items, self.axes[0],
                             do_integrity_check=True)
            blocks.append(blk)
        self.blocks = blocks

    def __len__(self):
        return len(self.items)

    def __repr__(self):
        output = 'BlockManager'
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += '\nItems: %s' % ax
            else:
                output += '\nAxis %d: %s' % (i, ax)

        for block in self.blocks:
            output += '\n%s' % repr(block)
        return output

    @property
    def shape(self):
        return tuple(len(ax) for ax in self.axes)

    def _verify_integrity(self):
        # _union_block_items raises if any item label appears in two blocks;
        # its return value is otherwise unused here
        union_items = _union_block_items(self.blocks)
        mgr_shape = self.shape
        for block in self.blocks:
            assert(block.values.shape[1:] == mgr_shape[1:])
        tot_items = sum(len(x.items) for x in self.blocks)
        assert(len(self.items) == tot_items)

    def astype(self, dtype):
        """Cast every block to ``dtype`` and reconsolidate."""
        new_blocks = []
        for block in self.blocks:
            newb = make_block(block.values.astype(dtype), block.items,
                              block.ref_items)
            new_blocks.append(newb)

        new_mgr = BlockManager(new_blocks, self.axes)
        return new_mgr.consolidate()

    def is_consolidated(self):
        """
        Return True if every dtype appears in at most one block (i.e. no two
        blocks could be merged)
        """
        dtypes = [blk.dtype for blk in self.blocks]
        return len(dtypes) == len(set(dtypes))

    def get_slice(self, slobj, axis=0):
        """Slice along one axis, returning a new BlockManager."""
        new_axes = list(self.axes)
        new_axes[axis] = new_axes[axis][slobj]

        if axis == 0:
            new_items = new_axes[0]
            if len(self.blocks) == 1:
                # single homogeneous block: slice its values directly
                blk = self.blocks[0]
                newb = make_block(blk.values[slobj], new_items,
                                  new_items)
                new_blocks = [newb]
            else:
                # multiple blocks: fall back to item reindexing
                return self.reindex_items(new_items)
        else:
            new_blocks = self._slice_blocks(slobj, axis)

        return BlockManager(new_blocks, new_axes, do_integrity_check=False)

    def _slice_blocks(self, slobj, axis):
        # apply the same slice to every block along ``axis``
        new_blocks = []
        slicer = [slice(None, None) for _ in range(self.ndim)]
        slicer[axis] = slobj
        slicer = tuple(slicer)
        for block in self.blocks:
            newb = make_block(block.values[slicer], block.items,
                              block.ref_items)
            new_blocks.append(newb)
        return new_blocks

    def get_series_dict(self):
        # For DataFrame
        return _blocks_to_series_dict(self.blocks, self.axes[1])

    @classmethod
    def from_blocks(cls, blocks, index):
        # also checks for overlap
        items = _union_block_items(blocks)
        return BlockManager(blocks, [items, index])

    def __contains__(self, item):
        return item in self.items

    @property
    def nblocks(self):
        return len(self.blocks)

    def copy(self, deep=True):
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : boolean, default True
            If False, return shallow copy (do not copy data)

        Returns
        -------
        copy : BlockManager
        """
        copy_blocks = [block.copy(deep=deep) for block in self.blocks]
        return BlockManager(copy_blocks, self.axes)

    def as_matrix(self, items=None):
        """Return the data as one ndarray, optionally in ``items`` order."""
        if len(self.blocks) == 0:
            mat = np.empty(self.shape, dtype=float)
        elif len(self.blocks) == 1:
            blk = self.blocks[0]
            if items is None or blk.items.equals(items):
                # if not, then just call interleave per below
                mat = blk.values
            else:
                mat = self.reindex_items(items).as_matrix()
        else:
            if items is None:
                mat = self._interleave(self.items)
            else:
                mat = self.reindex_items(items).as_matrix()
        return mat

    def _interleave(self, items):
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        dtype = _interleaved_dtype(self.blocks)
        items = _ensure_index(items)

        result = np.empty(self.shape, dtype=dtype)
        itemmask = np.zeros(len(items), dtype=bool)

        # By construction, all of the item should be covered by one of the
        # blocks
        for block in self.blocks:
            indexer = items.get_indexer(block.items)
            assert((indexer != -1).all())
            result[indexer] = block.values
            itemmask[indexer] = 1

        assert(itemmask.all())
        return result

    def xs(self, key, axis=1, copy=True):
        """Cross-section along a non-item axis at label ``key``."""
        assert(axis >= 1)

        loc = self.axes[axis].get_loc(key)
        slicer = [slice(None, None) for _ in range(self.ndim)]
        slicer[axis] = loc
        slicer = tuple(slicer)

        new_axes = list(self.axes)

        # could be an array indexer!
        if isinstance(loc, (slice, np.ndarray)):
            new_axes[axis] = new_axes[axis][loc]
        else:
            # scalar label: the axis collapses away
            new_axes.pop(axis)

        new_blocks = []
        if len(self.blocks) > 1:
            if not copy:
                raise Exception('cannot get view of mixed-type or '
                                'non-consolidated DataFrame')
            for blk in self.blocks:
                newb = make_block(blk.values[slicer], blk.items, blk.ref_items)
                new_blocks.append(newb)
        elif len(self.blocks) == 1:
            vals = self.blocks[0].values[slicer]
            if copy:
                vals = vals.copy()
            new_blocks = [make_block(vals, self.items, self.items)]

        return BlockManager(new_blocks, new_axes)

    def fast_2d_xs(self, loc, copy=False):
        """
        Fast path for a 2-D cross-section at integer position ``loc`` along
        axis 1; returns a 1-D ndarray in item order.
        """
        if len(self.blocks) == 1:
            result = self.blocks[0].values[:, loc]
            if copy:
                result = result.copy()
            return result

        if not copy:
            raise Exception('cannot get view of mixed-type or '
                            'non-consolidated DataFrame')

        dtype = _interleaved_dtype(self.blocks)

        items = self.items
        n = len(items)
        result = np.empty(n, dtype=dtype)
        # gather element-by-element from each block into item order
        for blk in self.blocks:
            values = blk.values
            for j, item in enumerate(blk.items):
                i = items.get_loc(item)
                result[i] = values[j, loc]

        return result

    def consolidate(self):
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        new_blocks = _consolidate(self.blocks, self.items)
        return BlockManager(new_blocks, self.axes)

    def get(self, item):
        _, block = self._find_block(item)
        return block.get(item)

    def delete(self, item):
        """Remove ``item`` in place from its block and the item index."""
        i, _ = self._find_block(item)
        loc = self.items.get_loc(item)

        new_items = Index(np.delete(np.asarray(self.items), loc))

        self._delete_from_block(i, item)
        self.set_items_norename(new_items)

    def set(self, item, value):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        """
        if value.ndim == self.ndim - 1:
            value = value.reshape((1,) + value.shape)
        assert(value.shape[1:] == self.shape[1:])
        if item in self.items:
            i, block = self._find_block(item)
            if not block.can_store(value):
                # delete from block, create and append new block
                self._delete_from_block(i, item)
                self._add_new_block(item, value)
            else:
                block.set(item, value)
        else:
            # insert at end
            self.insert(len(self.items), item, value)

    def insert(self, loc, item, value):
        if item in self.items:
            raise Exception('cannot insert %s, already exists' % item)

        new_items = self.items.insert(loc, item)
        self.set_items_norename(new_items)

        # new block
        self._add_new_block(item, value)

    def _delete_from_block(self, i, item):
        """
        Delete and maybe remove the whole block
        """
        block = self.blocks[i]
        newb = block.delete(item)

        if len(newb.ref_locs) == 0:
            self.blocks.pop(i)
        else:
            self.blocks[i] = newb

    def _add_new_block(self, item, value):
        # Do we care about dtype at the moment?

        # hm, elaborate hack?
        loc = self.items.get_loc(item)
        new_block = make_block(value, self.items[loc:loc+1], self.items)
        self.blocks.append(new_block)

    def _find_block(self, item):
        # returns (block index, block); None implicitly if _check_have passed
        # but no block contains the item (should not happen)
        self._check_have(item)
        for i, block in enumerate(self.blocks):
            if item in block:
                return i, block

    def _check_have(self, item):
        if item not in self.items:
            raise KeyError('no item named %s' % str(item))

    def reindex_axis(self, new_axis, method=None, axis=0):
        """Conform the manager to ``new_axis`` along ``axis``."""
        if axis == 0:
            assert(method is None)
            return self.reindex_items(new_axis)

        new_axis = _ensure_index(new_axis)
        cur_axis = self.axes[axis]
        new_axis, indexer = cur_axis.reindex(new_axis, method)
        mask = indexer == -1

        # TODO: deal with length-0 case? or does it fall out?
        needs_masking = len(new_axis) > 0 and mask.any()

        new_blocks = []
        for block in self.blocks:
            newb = block.reindex_axis(indexer, mask, needs_masking,
                                      axis=axis)
            new_blocks.append(newb)

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        return BlockManager(new_blocks, new_axes)

    def reindex_indexer(self, new_axis, indexer, axis=1):
        """
        pandas-indexer with -1's only
        """
        if axis == 0:
            raise NotImplementedError

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        new_blocks = []
        for blk in self.blocks:
            new_values = common.take_fast(blk.values, indexer, None,
                                          False, axis=axis)
            newb = make_block(new_values, blk.items, self.items)
            new_blocks.append(newb)

        return BlockManager(new_blocks, new_axes)

    def reindex_items(self, new_items):
        """
        Conform the item axis to ``new_items``; labels not found in any block
        are filled with an all-NaN float block.
        """
        new_items = _ensure_index(new_items)
        data = self
        if not data.is_consolidated():
            # consolidate first, then retry on the consolidated manager
            data = data.consolidate()
            return data.reindex_items(new_items)

        # TODO: this part could be faster (!)
        new_items, indexer = self.items.reindex(new_items)
        mask = indexer == -1

        new_blocks = []
        for block in self.blocks:
            newb = block.reindex_items_from(new_items)
            if len(newb.items) > 0:
                new_blocks.append(newb)

        if mask.any():
            extra_items = new_items[mask]

            block_shape = list(self.shape)
            block_shape[0] = len(extra_items)
            block_values = np.empty(block_shape, dtype=np.float64)
            block_values.fill(nan)
            na_block = make_block(block_values, extra_items, new_items,
                                  do_integrity_check=True)
            new_blocks.append(na_block)
            new_blocks = _consolidate(new_blocks, new_items)

        new_axes = list(self.axes)
        new_axes[0] = new_items
        return BlockManager(new_blocks, new_axes)

    def take(self, indexer, axis=1, pandas_indexer=False):
        """Take positions along ``axis``; pandas_indexer allows -1 entries."""
        if axis == 0:
            raise NotImplementedError

        if pandas_indexer:
            take_f = lambda arr: common.take_fast(arr, indexer,
                                                  None, False, axis=axis)
        else:
            take_f = lambda arr: arr.take(indexer, axis=axis)

        new_axes = list(self.axes)
        new_axes[axis] = self.axes[axis].take(indexer)
        new_blocks = []
        for blk in self.blocks:
            newb = make_block(take_f(blk.values), blk.items, self.items)
            new_blocks.append(newb)

        return BlockManager(new_blocks, new_axes)

    def merge(self, other, lsuffix=None, rsuffix=None):
        """Combine with an identically-indexed manager, suffixing overlaps."""
        assert(self._is_indexed_like(other))

        this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

        cons_items = this.items + other.items
        consolidated = _consolidate(this.blocks + other.blocks, cons_items)

        new_axes = list(this.axes)
        new_axes[0] = cons_items

        return BlockManager(consolidated, new_axes)

    def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
        # suffix overlapping item labels on both sides; raises if they
        # overlap and no suffixes were supplied
        intersection = self.items.intersection(other.items)

        if len(intersection) > 0:
            if not lsuffix and not rsuffix:
                raise Exception('columns overlap: %s' % intersection)

            def lrenamer(x):
                if x in intersection:
                    return '%s%s' % (x, lsuffix)
                return x

            def rrenamer(x):
                if x in intersection:
                    return '%s%s' % (x, rsuffix)
                return x

            # XXX: COPIES DATA!
            this = self.rename_items(lrenamer, copydata=copydata)
            other = other.rename_items(rrenamer, copydata=copydata)
        else:
            this = self

        return this, other

    def _is_indexed_like(self, other):
        """
        Check all axes except items
        """
        assert(self.ndim == other.ndim)
        for ax, oax in zip(self.axes[1:], other.axes[1:]):
            if not ax.equals(oax):
                return False
        return True

    def join_on(self, other, on, axis=1, lsuffix=None, rsuffix=None):
        """Join ``other`` by aligning its ``axis`` to the ``on`` labels."""
        this, other = self._maybe_rename_join(other, lsuffix, rsuffix)

        other_axis = other.axes[axis]
        indexer = other_axis.get_indexer(on)

        # TODO: deal with length-0 case? or does it fall out?
        mask = indexer == -1
        needs_masking = len(on) > 0 and mask.any()

        other_blocks = []
        for block in other.blocks:
            newb = block.reindex_axis(indexer, mask, needs_masking, axis=axis)
            other_blocks.append(newb)

        cons_items = this.items + other.items
        consolidated = _consolidate(this.blocks + other_blocks, cons_items)

        new_axes = list(this.axes)
        new_axes[0] = cons_items
        return BlockManager(consolidated, new_axes)

    def rename_axis(self, mapper, axis=1):
        # _verify_integrity on the Index raises on duplicate mapped labels
        new_axis = Index([mapper(x) for x in self.axes[axis]])
        new_axis._verify_integrity()

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        return BlockManager(self.blocks, new_axes)

    def rename_items(self, mapper, copydata=True):
        new_items = Index([mapper(x) for x in self.items])
        new_items._verify_integrity()

        new_blocks = []
        for block in self.blocks:
            newb = block.copy(deep=copydata)
            newb.set_ref_items(new_items, maybe_rename=True)
            new_blocks.append(newb)
        new_axes = list(self.axes)
        new_axes[0] = new_items
        return BlockManager(new_blocks, new_axes)

    def add_prefix(self, prefix):
        # bound __mod__ of 'prefix%s' acts as lambda x: prefix + str(x)
        f = (('%s' % prefix) + '%s').__mod__
        return self.rename_items(f)

    def add_suffix(self, suffix):
        f = ('%s' + ('%s' % suffix)).__mod__
        return self.rename_items(f)

    def fillna(self, value):
        """
        Return a new manager with nulls in every block replaced by ``value``.
        """
        new_blocks = [b.fillna(value) for b in self.blocks]
        return BlockManager(new_blocks, self.axes)

    @property
    def block_id_vector(self):
        # TODO
        # maps each item position to the index of the block holding it
        result = np.empty(len(self.items), dtype=int)
        result.fill(-1)

        for i, blk in enumerate(self.blocks):
            indexer = self.items.get_indexer(blk.items)
            assert((indexer != -1).all())
            result.put(indexer, i)

        assert((result >= 0).all())
        return result
  638. _data_types = [np.float_, np.int_]
  639. def form_blocks(data, axes):
  640. # pre-filter out items if we passed it
  641. items = axes[0]
  642. if len(data) < len(items):
  643. extra_items = items - Index(data.keys())
  644. else:
  645. extra_items = []
  646. # put "leftover" items in float bucket, where else?
  647. # generalize?
  648. float_dict = {}
  649. int_dict = {}
  650. bool_dict = {}
  651. object_dict = {}
  652. for k, v in data.iteritems():
  653. if issubclass(v.dtype.type, np.floating):
  654. float_dict[k] = v
  655. elif issubclass(v.dtype.type, np.integer):
  656. int_dict[k] = v
  657. elif v.dtype == np.bool_:
  658. bool_dict[k] = v
  659. else:
  660. object_dict[k] = v
  661. blocks = []
  662. if len(float_dict):
  663. float_block = _simple_blockify(float_dict, items, np.float64)
  664. blocks.append(float_block)
  665. if len(int_dict):
  666. int_block = _simple_blockify(int_dict, items, np.int_)
  667. blocks.append(int_block)
  668. if len(bool_dict):
  669. bool_block = _simple_blockify(bool_dict, items, np.bool_)
  670. blocks.append(bool_block)
  671. if len(object_dict) > 0:
  672. object_block = _simple_blockify(object_dict, items, np.object_)
  673. blocks.append(object_block)
  674. if len(extra_items):
  675. shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])
  676. block_values = np.empty(shape, dtype=float)
  677. block_values.fill(nan)
  678. na_block = make_block(block_values, extra_items, items,
  679. do_integrity_check=True)
  680. blocks.append(na_block)
  681. blocks = _consolidate(blocks, items)
  682. return blocks
  683. def _simple_blockify(dct, ref_items, dtype):
  684. block_items, values = _stack_dict(dct, ref_items)
  685. # CHECK DTYPE?
  686. if values.dtype != dtype: # pragma: no cover
  687. values = values.astype(dtype)
  688. return make_block(values, block_items, ref_items, do_integrity_check=True)
  689. def _stack_dict(dct, ref_items):
  690. items = [x for x in ref_items if x in dct]
  691. stacked = np.vstack([np.asarray(dct[k]) for k in items])
  692. return items, stacked
  693. def _blocks_to_series_dict(blocks, index=None):
  694. from pandas.core.series import Series
  695. series_dict = {}
  696. for block in blocks:
  697. for item, vec in zip(block.items, block.values):
  698. series_dict[item] = Series(vec, index=index, name=item)
  699. return series_dict
  700. def _interleaved_dtype(blocks):
  701. have_int = False
  702. have_bool = False
  703. have_object = False
  704. have_float = False
  705. for block in blocks:
  706. if isinstance(block, FloatBlock):
  707. have_float = True
  708. elif isinstance(block, IntBlock):
  709. have_int = True
  710. elif isinstance(block, BoolBlock):
  711. have_bool = True
  712. elif isinstance(block, ObjectBlock):
  713. have_object = True
  714. else: # pragma: no cover
  715. raise Exception('Unrecognized block type')
  716. have_numeric = have_float or have_int
  717. if have_object:
  718. return np.object_
  719. elif have_bool and have_numeric:
  720. return np.object_
  721. elif have_bool:
  722. return np.bool_
  723. elif have_int and not have_float:
  724. return np.int64
  725. else:
  726. return np.float64
  727. def _consolidate(blocks, items):
  728. """
  729. Merge blocks having same dtype
  730. """
  731. get_dtype = lambda x: x.dtype
  732. # sort by dtype
  733. grouper = itertools.groupby(sorted(blocks, key=get_dtype),
  734. lambda x: x.dtype)
  735. new_blocks = []
  736. for dtype, group_blocks in grouper:
  737. new_block = _merge_blocks(list(group_blocks), items)
  738. new_blocks.append(new_block)
  739. return new_blocks
  740. # TODO: this could be much optimized
  741. def _merge_blocks(blocks, items):
  742. if len(blocks) == 1:
  743. return blocks[0]
  744. new_values = np.vstack([b.values for b in blocks])
  745. new_items = np.concatenate([b.items for b in blocks])
  746. new_block = make_block(new_values, new_items, items,
  747. do_integrity_check=True)
  748. return new_block.reindex_items_from(items)
  749. def _union_block_items(blocks):
  750. tot_len = 0
  751. all_items = []
  752. slow = False
  753. for b in blocks:
  754. tot_len += len(b.items)
  755. if type(b.items) != Index:
  756. slow = True
  757. all_items.append(b.items)
  758. if slow:
  759. the_union = _union_items_slow(all_items)
  760. else:
  761. the_union = Index(lib.fast_unique_multiple(all_items))
  762. if tot_len > len(the_union):
  763. raise Exception('item names overlap')
  764. return the_union
  765. def _union_items_slow(all_items):
  766. seen = None
  767. for items in all_items:
  768. if seen is None:
  769. seen = items
  770. else:
  771. seen = seen.union(items)
  772. return seen
  773. def join_managers(left, right, axis=1, how='left', copy=True):
  774. op = _JoinOperation(left, right, axis=axis, how=how)
  775. return op.get_result(copy=copy)
class _JoinOperation(object):
    """
    Object responsible for orchestrating efficient join operation between two
    BlockManager data structures

    Notes
    -----
    Both managers must already be consolidated, so each dtype appears in at
    most one block per side; blocks are then matched by their Block subclass.
    """
    def __init__(self, left, right, axis=1, how='left'):
        self.left = left
        self.right = right
        self.axis = axis
        self.how = how

        assert(left.is_consolidated())
        assert(right.is_consolidated())

        laxis = left.axes[axis]
        raxis = right.axes[axis]

        (self.join_index,
         self.lindexer,
         self.rindexer) = laxis.join(raxis, how=how, return_indexers=True)

        # do NOT sort
        self.result_items = left.items.append(right.items)
        self.result_axes = list(left.axes)
        self.result_axes[0] = self.result_items
        self.result_axes[axis] = self.join_index

    def get_result(self, copy=False):
        """
        Assemble the joined BlockManager.

        Parameters
        ----------
        copy : bool, default False
            Passed through conceptually; see _reindex_block's copy handling.

        Returns
        -------
        merged : BlockManager
        """
        left_blockmap, right_blockmap = self._prepare_blocks()

        result_blocks = []

        # maybe want to enable flexible copying
        kinds = set(left_blockmap) | set(right_blockmap)
        for klass in kinds:
            lblk = left_blockmap.get(klass)
            rblk = right_blockmap.get(klass)

            if lblk and rblk:
                # true merge, do not produce intermediate copy
                res_blk = self._merge_blocks(lblk, rblk)
            elif lblk:
                res_blk = self._reindex_block(lblk, side='left')
            else:
                res_blk = self._reindex_block(rblk, side='right')

            result_blocks.append(res_blk)

        return BlockManager(result_blocks, self.result_axes)

    def _prepare_blocks(self):
        # map Block subclass -> block for each side; upcast int/bool to
        # float/object first if NaN-introducing alignment requires it
        lblocks = self.left.blocks
        rblocks = self.right.blocks

        # will short-circuit and not compute lneed_masking
        if self.lneed_masking:
            lblocks = self._upcast_blocks(lblocks)

        if self.rneed_masking:
            rblocks = self._upcast_blocks(rblocks)

        # consolidation guarantees at most one block per subclass per side
        left_blockmap = dict((type(blk), blk) for blk in lblocks)
        right_blockmap = dict((type(blk), blk) for blk in rblocks)

        return left_blockmap, right_blockmap

    def _reindex_block(self, block, side='left', copy=True):
        if side == 'left':
            indexer = self.lindexer
            mask, need_masking = self.lmask_info
        else:
            indexer = self.rindexer
            mask, need_masking = self.rmask_info

        # still some inefficiency here for bool/int64 because in the case where
        # no masking is needed, take_fast will recompute the mask

        if indexer is None and copy:
            result = block.copy()
        else:
            result = block.reindex_axis(indexer, mask, need_masking,
                                        axis=self.axis)

        # point the block at the combined item index of the join result
        result.ref_items = self.result_items
        return result

    @cache_readonly
    def lmask_info(self):
        # (mask, need_masking) for the left side; mask is None when no
        # reindexing happens or no int/bool block could require upcasting
        if (self.lindexer is None or
            not self._may_need_upcasting(self.left.blocks)):
            lmask = None
            lneed_masking = False
        else:
            lmask = self.lindexer == -1
            lneed_masking = lmask.any()

        return lmask, lneed_masking

    @cache_readonly
    def rmask_info(self):
        # mirror of lmask_info for the right side
        if (self.rindexer is None or
            not self._may_need_upcasting(self.right.blocks)):
            rmask = None
            rneed_masking = False
        else:
            rmask = self.rindexer == -1
            rneed_masking = rmask.any()

        return rmask, rneed_masking

    @property
    def lneed_masking(self):
        return self.lmask_info[1]

    @property
    def rneed_masking(self):
        return self.rmask_info[1]

    @staticmethod
    def _may_need_upcasting(blocks):
        # int/bool blocks cannot hold NaN, so alignment gaps force an upcast
        for block in blocks:
            if isinstance(block, (IntBlock, BoolBlock)):
                return True
        return False

    def _merge_blocks(self, lblk, rblk):
        """Write both sides' aligned values into one preallocated block."""
        lidx = self.lindexer
        ridx = self.rindexer

        n = lblk.values.shape[self.axis] if lidx is None else len(lidx)
        lk = len(lblk.items)
        rk = len(rblk.items)

        out_shape = list(lblk.shape)
        out_shape[0] = lk + rk
        out_shape[self.axis] = n

        out = np.empty(out_shape, dtype=lblk.values.dtype)

        # is this really faster than assigning to arr.flat?
        if lidx is None:
            # out[:lk] = lblk.values
            common.take_fast(lblk.values, np.arange(n, dtype='i4'),
                             None, False,
                             axis=self.axis, out=out[:lk])
        else:
            # write out the values to the result array
            common.take_fast(lblk.values, lidx, None, False,
                             axis=self.axis, out=out[:lk])
        if ridx is None:
            # out[lk:] = lblk.values
            common.take_fast(rblk.values, np.arange(n, dtype='i4'),
                             None, False,
                             axis=self.axis, out=out[lk:])
        else:
            common.take_fast(rblk.values, ridx, None, False,
                             axis=self.axis, out=out[lk:])

        # does not sort
        new_items = lblk.items.append(rblk.items)
        return make_block(out, new_items, self.result_items)

    @staticmethod
    def _upcast_blocks(blocks):
        """
        Upcast and consolidate if necessary

        int -> float, bool -> object, so the blocks can absorb NaN from
        join alignment; other blocks pass through unchanged.
        """
        # if not need_masking:
        #     return blocks
        new_blocks = []
        for block in blocks:
            if isinstance(block, IntBlock):
                newb = make_block(block.values.astype(float), block.items,
                                  block.ref_items)
            elif isinstance(block, BoolBlock):
                newb = make_block(block.values.astype(object), block.items,
                                  block.ref_items)
            else:
                newb = block
            new_blocks.append(newb)

        # use any ref_items
        # NOTE(review): ``newb`` is the loop variable; this raises NameError
        # if ``blocks`` is empty — callers appear to guarantee non-empty
        return _consolidate(new_blocks, newb.ref_items)