PageRenderTime 64ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/core/internals.py

https://github.com/kljensen/pandas
Python | 1453 lines | 1269 code | 113 blank | 71 comment | 113 complexity | 6c723fa2c020e89bfc0e1ff711da021d MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import itertools
  2. from datetime import datetime
  3. from numpy import nan
  4. import numpy as np
  5. from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes
  6. import pandas.core.common as com
  7. import pandas.lib as lib
  8. class Block(object):
  9. """
  10. Canonical n-dimensional unit of homogeneous dtype contained in a pandas data
  11. structure
  12. Index-ignorant; let the container take care of that
  13. """
  14. __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim']
  15. def __init__(self, values, items, ref_items, ndim=2,
  16. do_integrity_check=False):
  17. if issubclass(values.dtype.type, basestring):
  18. values = np.array(values, dtype=object)
  19. assert(values.ndim == ndim)
  20. assert(len(items) == len(values))
  21. self.values = values
  22. self.ndim = ndim
  23. self.items = _ensure_index(items)
  24. self.ref_items = _ensure_index(ref_items)
  25. if do_integrity_check:
  26. self._check_integrity()
  27. def _check_integrity(self):
  28. if len(self.items) < 2:
  29. return
  30. # monotonicity
  31. return (self.ref_locs[1:] > self.ref_locs[:-1]).all()
  32. _ref_locs = None
  33. @property
  34. def ref_locs(self):
  35. if self._ref_locs is None:
  36. indexer = self.ref_items.get_indexer(self.items)
  37. indexer = com._ensure_platform_int(indexer)
  38. assert((indexer != -1).all())
  39. self._ref_locs = indexer
  40. return self._ref_locs
  41. def set_ref_items(self, ref_items, maybe_rename=True):
  42. """
  43. If maybe_rename=True, need to set the items for this guy
  44. """
  45. assert(isinstance(ref_items, Index))
  46. if maybe_rename:
  47. self.items = ref_items.take(self.ref_locs)
  48. self.ref_items = ref_items
  49. def __repr__(self):
  50. shape = ' x '.join([str(s) for s in self.shape])
  51. name = type(self).__name__
  52. return '%s: %s, %s, dtype %s' % (name, self.items, shape, self.dtype)
  53. def __contains__(self, item):
  54. return item in self.items
  55. def __len__(self):
  56. return len(self.values)
  57. def __getstate__(self):
  58. # should not pickle generally (want to share ref_items), but here for
  59. # completeness
  60. return (self.items, self.ref_items, self.values)
  61. def __setstate__(self, state):
  62. items, ref_items, values = state
  63. self.items = _ensure_index(items)
  64. self.ref_items = _ensure_index(ref_items)
  65. self.values = values
  66. self.ndim = values.ndim
  67. @property
  68. def shape(self):
  69. return self.values.shape
  70. @property
  71. def dtype(self):
  72. return self.values.dtype
  73. def copy(self, deep=True):
  74. values = self.values
  75. if deep:
  76. values = values.copy()
  77. return make_block(values, self.items, self.ref_items)
  78. def merge(self, other):
  79. assert(self.ref_items.equals(other.ref_items))
  80. # Not sure whether to allow this or not
  81. # if not union_ref.equals(other.ref_items):
  82. # union_ref = self.ref_items + other.ref_items
  83. return _merge_blocks([self, other], self.ref_items)
  84. def reindex_axis(self, indexer, mask, needs_masking, axis=0,
  85. fill_value=np.nan):
  86. """
  87. Reindex using pre-computed indexer information
  88. """
  89. if self.values.size > 0:
  90. new_values = com.take_fast(self.values, indexer, mask,
  91. needs_masking, axis=axis,
  92. fill_value=fill_value)
  93. else:
  94. shape = list(self.shape)
  95. shape[axis] = len(indexer)
  96. new_values = np.empty(shape)
  97. new_values.fill(fill_value)
  98. return make_block(new_values, self.items, self.ref_items)
  99. def reindex_items_from(self, new_ref_items, copy=True):
  100. """
  101. Reindex to only those items contained in the input set of items
  102. E.g. if you have ['a', 'b'], and the input items is ['b', 'c', 'd'],
  103. then the resulting items will be ['b']
  104. Returns
  105. -------
  106. reindexed : Block
  107. """
  108. new_ref_items, indexer = self.items.reindex(new_ref_items)
  109. if indexer is None:
  110. new_items = new_ref_items
  111. new_values = self.values.copy() if copy else self.values
  112. else:
  113. mask = indexer != -1
  114. masked_idx = indexer[mask]
  115. if self.values.ndim == 2:
  116. new_values = com.take_2d(self.values, masked_idx, axis=0,
  117. needs_masking=False)
  118. else:
  119. new_values = self.values.take(masked_idx, axis=0)
  120. new_items = self.items.take(masked_idx)
  121. return make_block(new_values, new_items, new_ref_items)
  122. def get(self, item):
  123. loc = self.items.get_loc(item)
  124. return self.values[loc]
  125. def set(self, item, value):
  126. """
  127. Modify Block in-place with new item value
  128. Returns
  129. -------
  130. None
  131. """
  132. loc = self.items.get_loc(item)
  133. self.values[loc] = value
  134. def delete(self, item):
  135. """
  136. Returns
  137. -------
  138. y : Block (new object)
  139. """
  140. loc = self.items.get_loc(item)
  141. new_items = self.items.delete(loc)
  142. new_values = np.delete(self.values, loc, 0)
  143. return make_block(new_values, new_items, self.ref_items)
  144. def split_block_at(self, item):
  145. """
  146. Split block around given column, for "deleting" a column without
  147. having to copy data by returning views on the original array
  148. Returns
  149. -------
  150. leftb, rightb : (Block or None, Block or None)
  151. """
  152. loc = self.items.get_loc(item)
  153. if len(self.items) == 1:
  154. # no blocks left
  155. return None, None
  156. if loc == 0:
  157. # at front
  158. left_block = None
  159. right_block = make_block(self.values[1:], self.items[1:].copy(),
  160. self.ref_items)
  161. elif loc == len(self.values) - 1:
  162. # at back
  163. left_block = make_block(self.values[:-1], self.items[:-1].copy(),
  164. self.ref_items)
  165. right_block = None
  166. else:
  167. # in the middle
  168. left_block = make_block(self.values[:loc],
  169. self.items[:loc].copy(), self.ref_items)
  170. right_block = make_block(self.values[loc + 1:],
  171. self.items[loc + 1:].copy(),
  172. self.ref_items)
  173. return left_block, right_block
  174. def fillna(self, value, inplace=False):
  175. new_values = self.values if inplace else self.values.copy()
  176. mask = com.isnull(new_values)
  177. np.putmask(new_values, mask, value)
  178. if inplace:
  179. return self
  180. else:
  181. return make_block(new_values, self.items, self.ref_items)
  182. def _can_hold_element(self, value):
  183. raise NotImplementedError()
  184. def _try_cast(self, value):
  185. raise NotImplementedError()
  186. def replace(self, to_replace, value, inplace=False):
  187. new_values = self.values if inplace else self.values.copy()
  188. if self._can_hold_element(value):
  189. value = self._try_cast(value)
  190. if not isinstance(to_replace, (list, np.ndarray)):
  191. if self._can_hold_element(to_replace):
  192. to_replace = self._try_cast(to_replace)
  193. np.putmask(new_values, com.mask_missing(new_values, to_replace),
  194. value)
  195. else:
  196. try:
  197. to_replace = np.array(to_replace, dtype=self.dtype)
  198. np.putmask(new_values, com.mask_missing(new_values, to_replace),
  199. value)
  200. except:
  201. to_replace = np.array(to_replace, dtype=object)
  202. for r in to_replace:
  203. if self._can_hold_element(r):
  204. r = self._try_cast(r)
  205. np.putmask(new_values, com.mask_missing(new_values, to_replace),
  206. value)
  207. if inplace:
  208. return self
  209. else:
  210. return make_block(new_values, self.items, self.ref_items)
  211. def putmask(self, mask, new, inplace=False):
  212. new_values = self.values if inplace else self.values.copy()
  213. if self._can_hold_element(new):
  214. new = self._try_cast(new)
  215. np.putmask(new_values, mask, new)
  216. if inplace:
  217. return self
  218. else:
  219. return make_block(new_values, self.items, self.ref_items)
  220. def interpolate(self, method='pad', axis=0, inplace=False,
  221. limit=None, missing=None):
  222. values = self.values if inplace else self.values.copy()
  223. if values.ndim != 2:
  224. raise NotImplementedError
  225. transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
  226. if missing is None:
  227. mask = None
  228. else: # todo create faster fill func without masking
  229. mask = _mask_missing(transf(values), missing)
  230. if method == 'pad':
  231. com.pad_2d(transf(values), limit=limit, mask=mask)
  232. else:
  233. com.backfill_2d(transf(values), limit=limit, mask=mask)
  234. return make_block(values, self.items, self.ref_items)
  235. def take(self, indexer, axis=1, fill_value=np.nan):
  236. assert(axis >= 1)
  237. new_values = com.take_fast(self.values, indexer, None,
  238. None, axis=axis,
  239. fill_value=fill_value)
  240. return make_block(new_values, self.items, self.ref_items)
  241. def get_values(self, dtype):
  242. return self.values
  243. def _mask_missing(array, missing_values):
  244. if not isinstance(missing_values, (list, np.ndarray)):
  245. missing_values = [missing_values]
  246. mask = None
  247. missing_values = np.array(missing_values, dtype=object)
  248. if com.isnull(missing_values).any():
  249. mask = com.isnull(array)
  250. missing_values = missing_values[com.notnull(missing_values)]
  251. for v in missing_values:
  252. if mask is None:
  253. mask = array == missing_values
  254. else:
  255. mask |= array == missing_values
  256. return mask
  257. #-------------------------------------------------------------------------------
  258. # Is this even possible?
  259. class FloatBlock(Block):
  260. _can_hold_na = True
  261. def _can_hold_element(self, element):
  262. return isinstance(element, (float, int))
  263. def _try_cast(self, element):
  264. try:
  265. return float(element)
  266. except: # pragma: no cover
  267. return element
  268. def should_store(self, value):
  269. # when inserting a column should not coerce integers to floats
  270. # unnecessarily
  271. return issubclass(value.dtype.type, np.floating)
  272. class ComplexBlock(Block):
  273. _can_hold_na = True
  274. def _can_hold_element(self, element):
  275. return isinstance(element, complex)
  276. def _try_cast(self, element):
  277. try:
  278. return complex(element)
  279. except: # pragma: no cover
  280. return element
  281. def should_store(self, value):
  282. return issubclass(value.dtype.type, np.complexfloating)
  283. class IntBlock(Block):
  284. _can_hold_na = False
  285. def _can_hold_element(self, element):
  286. return com.is_integer(element)
  287. def _try_cast(self, element):
  288. try:
  289. return int(element)
  290. except: # pragma: no cover
  291. return element
  292. def should_store(self, value):
  293. return issubclass(value.dtype.type, np.integer)
  294. class BoolBlock(Block):
  295. _can_hold_na = False
  296. def _can_hold_element(self, element):
  297. return isinstance(element, (int, bool))
  298. def _try_cast(self, element):
  299. try:
  300. return bool(element)
  301. except: # pragma: no cover
  302. return element
  303. def should_store(self, value):
  304. return issubclass(value.dtype.type, np.bool_)
  305. class ObjectBlock(Block):
  306. _can_hold_na = True
  307. def _can_hold_element(self, element):
  308. return True
  309. def _try_cast(self, element):
  310. return element
  311. def should_store(self, value):
  312. return not issubclass(value.dtype.type,
  313. (np.integer, np.floating, np.complexfloating,
  314. np.datetime64, np.bool_))
# canonical storage dtype for datetime data: datetime64 at nanosecond resolution
_NS_DTYPE = np.dtype('M8[ns]')
  316. class DatetimeBlock(Block):
  317. _can_hold_na = True
  318. def __init__(self, values, items, ref_items, ndim=2,
  319. do_integrity_check=False):
  320. if values.dtype != _NS_DTYPE:
  321. values = lib.cast_to_nanoseconds(values)
  322. Block.__init__(self, values, items, ref_items, ndim=ndim,
  323. do_integrity_check=do_integrity_check)
  324. def _can_hold_element(self, element):
  325. return com.is_integer(element) or isinstance(element, datetime)
  326. def _try_cast(self, element):
  327. try:
  328. return int(element)
  329. except:
  330. return element
  331. def should_store(self, value):
  332. return issubclass(value.dtype.type, np.datetime64)
  333. def set(self, item, value):
  334. """
  335. Modify Block in-place with new item value
  336. Returns
  337. -------
  338. None
  339. """
  340. loc = self.items.get_loc(item)
  341. if value.dtype != _NS_DTYPE:
  342. value = lib.cast_to_nanoseconds(value)
  343. self.values[loc] = value
  344. def get_values(self, dtype):
  345. if dtype == object:
  346. flat_i8 = self.values.ravel().view(np.int64)
  347. res = lib.ints_to_pydatetime(flat_i8)
  348. return res.reshape(self.values.shape)
  349. return self.values
  350. def make_block(values, items, ref_items, do_integrity_check=False):
  351. dtype = values.dtype
  352. vtype = dtype.type
  353. if issubclass(vtype, np.floating):
  354. klass = FloatBlock
  355. elif issubclass(vtype, np.complexfloating):
  356. klass = ComplexBlock
  357. elif issubclass(vtype, np.datetime64):
  358. klass = DatetimeBlock
  359. elif issubclass(vtype, np.integer):
  360. if vtype != np.int64:
  361. values = values.astype('i8')
  362. klass = IntBlock
  363. elif dtype == np.bool_:
  364. klass = BoolBlock
  365. else:
  366. klass = ObjectBlock
  367. return klass(values, items, ref_items, ndim=values.ndim,
  368. do_integrity_check=do_integrity_check)
  369. # TODO: flexible with index=None and/or items=None
  370. class BlockManager(object):
  371. """
  372. Core internal data structure to implement DataFrame
  373. Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
  374. lightweight blocked set of labeled data to be manipulated by the DataFrame
  375. public API class
  376. Parameters
  377. ----------
  378. Notes
  379. -----
  380. This is *not* a public API class
  381. """
  382. __slots__ = ['axes', 'blocks', 'ndim']
    def __init__(self, blocks, axes, do_integrity_check=True):
        """
        Parameters
        ----------
        blocks : list of Block
        axes : list of Index-like, items (axis 0) first
        do_integrity_check : bool, default True
            Verify that the blocks are consistent with the axes.
        """
        self.axes = [_ensure_index(ax) for ax in axes]
        self.blocks = blocks

        ndim = len(axes)
        # every block must have the same dimensionality as the manager
        for block in blocks:
            assert(ndim == block.values.ndim)

        if do_integrity_check:
            self._verify_integrity()
  391. @classmethod
  392. def make_empty(self):
  393. return BlockManager([], [[], []])
  394. def __nonzero__(self):
  395. return True
  396. @property
  397. def ndim(self):
  398. return len(self.axes)
  399. def is_mixed_dtype(self):
  400. counts = set()
  401. for block in self.blocks:
  402. counts.add(block.dtype)
  403. if len(counts) > 1:
  404. return True
  405. return False
  406. def set_axis(self, axis, value):
  407. cur_axis = self.axes[axis]
  408. if len(value) != len(cur_axis):
  409. raise Exception('Length mismatch (%d vs %d)'
  410. % (len(value), len(cur_axis)))
  411. self.axes[axis] = _ensure_index(value)
  412. if axis == 0:
  413. for block in self.blocks:
  414. block.set_ref_items(self.items, maybe_rename=True)
    # make items read only for now
    def _get_items(self):
        return self.axes[0]

    # read-only accessor for the axis-0 labels (no fset on purpose)
    items = property(fget=_get_items)
  419. def __getstate__(self):
  420. block_values = [b.values for b in self.blocks]
  421. block_items = [b.items for b in self.blocks]
  422. axes_array = [ax for ax in self.axes]
  423. return axes_array, block_values, block_items
    def __setstate__(self, state):
        """Restore from the (axes, block values, block items) pickle tuple."""
        # discard anything after 3rd, support beta pickling format for a little
        # while longer
        ax_arrays, bvalues, bitems = state[:3]

        self.axes = [_ensure_index(ax) for ax in ax_arrays]
        # upgrade any pre-Index pickled axes to real Index objects
        self.axes = _handle_legacy_indexes(self.axes)

        blocks = []
        for values, items in zip(bvalues, bitems):
            # integrity check guards against stale / corrupt pickles
            blk = make_block(values, items, self.axes[0],
                             do_integrity_check=True)
            blocks.append(blk)
        self.blocks = blocks
  436. def __len__(self):
  437. return len(self.items)
  438. def __repr__(self):
  439. output = 'BlockManager'
  440. for i, ax in enumerate(self.axes):
  441. if i == 0:
  442. output += '\nItems: %s' % ax
  443. else:
  444. output += '\nAxis %d: %s' % (i, ax)
  445. for block in self.blocks:
  446. output += '\n%s' % repr(block)
  447. return output
  448. @property
  449. def shape(self):
  450. return tuple(len(ax) for ax in self.axes)
  451. def _verify_integrity(self):
  452. # _union_block_items(self.blocks)
  453. mgr_shape = self.shape
  454. for block in self.blocks:
  455. assert(block.ref_items is self.items)
  456. assert(block.values.shape[1:] == mgr_shape[1:])
  457. tot_items = sum(len(x.items) for x in self.blocks)
  458. assert(len(self.items) == tot_items)
  459. def astype(self, dtype):
  460. new_blocks = []
  461. for block in self.blocks:
  462. newb = make_block(com._astype_nansafe(block.values, dtype),
  463. block.items, block.ref_items)
  464. new_blocks.append(newb)
  465. new_mgr = BlockManager(new_blocks, self.axes)
  466. return new_mgr.consolidate()
  467. def is_consolidated(self):
  468. """
  469. Return True if more than one block with the same dtype
  470. """
  471. dtypes = [blk.dtype.type for blk in self.blocks]
  472. return len(dtypes) == len(set(dtypes))
    def get_numeric_data(self, copy=False, type_list=None):
        """
        Parameters
        ----------
        copy : boolean, default False
            Whether to copy the blocks
        type_list : tuple of type, default None
            Numeric types by default (Float/Complex/Int but not Datetime)
        """
        if type_list is None:
            def filter_blocks(block):
                # NOTE(review): the DatetimeBlock exclusion looks redundant
                # (DatetimeBlock is not a subclass of the listed types) --
                # presumably defensive; confirm before removing
                return (isinstance(block, (IntBlock, FloatBlock, ComplexBlock))
                        and not isinstance(block, DatetimeBlock))
        else:
            type_list = self._get_clean_block_types(type_list)
            filter_blocks = lambda block: isinstance(block, type_list)

        maybe_copy = lambda b: b.copy() if copy else b
        num_blocks = [maybe_copy(b) for b in self.blocks if filter_blocks(b)]

        if len(num_blocks) == 0:
            return BlockManager.make_empty()

        # new items: the selected blocks' labels in ref-order
        indexer = np.sort(np.concatenate([b.ref_locs for b in num_blocks]))
        new_items = self.items.take(indexer)

        new_blocks = []
        for b in num_blocks:
            # shallow copy so ref_items can be repointed without mutating
            # the originals (values are shared unless copy=True above)
            b = b.copy(deep=False)
            b.ref_items = new_items
            new_blocks.append(b)
        new_axes = list(self.axes)
        new_axes[0] = new_items
        return BlockManager(new_blocks, new_axes, do_integrity_check=False)
  503. def _get_clean_block_types(self, type_list):
  504. if not isinstance(type_list, tuple):
  505. try:
  506. type_list = tuple(type_list)
  507. except TypeError:
  508. type_list = (type_list,)
  509. type_map = {int : IntBlock, float : FloatBlock,
  510. complex : ComplexBlock,
  511. np.datetime64 : DatetimeBlock,
  512. datetime : DatetimeBlock,
  513. bool : BoolBlock,
  514. object : ObjectBlock}
  515. type_list = tuple([type_map.get(t, t) for t in type_list])
  516. return type_list
  517. def get_bool_data(self, copy=False):
  518. return self.get_numeric_data(copy=copy, type_list=(BoolBlock,))
    def get_slice(self, slobj, axis=0):
        """Slice the manager along *axis* with slice object *slobj*."""
        new_axes = list(self.axes)
        new_axes[axis] = new_axes[axis][slobj]

        if axis == 0:
            new_items = new_axes[0]
            if len(self.blocks) == 1:
                # single block: slice its values directly
                blk = self.blocks[0]
                newb = make_block(blk.values[slobj], new_items,
                                  new_items)
                new_blocks = [newb]
            else:
                # multiple blocks: fall back on item reindexing
                return self.reindex_items(new_items)
        else:
            new_blocks = self._slice_blocks(slobj, axis)

        return BlockManager(new_blocks, new_axes, do_integrity_check=False)
  534. def _slice_blocks(self, slobj, axis):
  535. new_blocks = []
  536. slicer = [slice(None, None) for _ in range(self.ndim)]
  537. slicer[axis] = slobj
  538. slicer = tuple(slicer)
  539. for block in self.blocks:
  540. newb = make_block(block.values[slicer], block.items,
  541. block.ref_items)
  542. new_blocks.append(newb)
  543. return new_blocks
    def get_series_dict(self):
        # For DataFrame: map each item to a Series over axis 1
        return _blocks_to_series_dict(self.blocks, self.axes[1])

    def __contains__(self, item):
        # membership is against the items (axis 0) labels
        return item in self.items

    @property
    def nblocks(self):
        # number of internal blocks currently held
        return len(self.blocks)
  552. def copy(self, deep=True):
  553. """
  554. Make deep or shallow copy of BlockManager
  555. Parameters
  556. ----------
  557. deep : boolean, default True
  558. If False, return shallow copy (do not copy data)
  559. Returns
  560. -------
  561. copy : BlockManager
  562. """
  563. copy_blocks = [block.copy(deep=deep) for block in self.blocks]
  564. # copy_axes = [ax.copy() for ax in self.axes]
  565. copy_axes = list(self.axes)
  566. return BlockManager(copy_blocks, copy_axes, do_integrity_check=False)
    def as_matrix(self, items=None):
        """Return the data as one ndarray, optionally in *items* order."""
        if len(self.blocks) == 0:
            # empty manager: shape-correct float array
            mat = np.empty(self.shape, dtype=float)
        elif len(self.blocks) == 1:
            blk = self.blocks[0]
            if items is None or blk.items.equals(items):
                # if not, then just call interleave per below
                mat = blk.values
            else:
                mat = self.reindex_items(items).as_matrix()
        else:
            if items is None:
                mat = self._interleave(self.items)
            else:
                mat = self.reindex_items(items).as_matrix()

        return mat
    def _interleave(self, items):
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        dtype = _interleaved_dtype(self.blocks)
        items = _ensure_index(items)

        result = np.empty(self.shape, dtype=dtype)
        # tracks which rows of the result were written at least once
        itemmask = np.zeros(len(items), dtype=bool)

        # By construction, all of the item should be covered by one of the
        # blocks
        for block in self.blocks:
            indexer = items.get_indexer(block.items)
            assert((indexer != -1).all())
            result[indexer] = block.get_values(dtype)
            itemmask[indexer] = 1

        # every output row must have been filled by some block
        assert(itemmask.all())
        return result
    def xs(self, key, axis=1, copy=True):
        """Cross-section at *key* along a non-item axis."""
        assert(axis >= 1)

        loc = self.axes[axis].get_loc(key)
        slicer = [slice(None, None) for _ in range(self.ndim)]
        slicer[axis] = loc
        slicer = tuple(slicer)

        new_axes = list(self.axes)

        # could be an array indexer!
        if isinstance(loc, (slice, np.ndarray)):
            new_axes[axis] = new_axes[axis][loc]
        else:
            # scalar location: that axis disappears from the result
            new_axes.pop(axis)

        new_blocks = []
        if len(self.blocks) > 1:
            if not copy:
                raise Exception('cannot get view of mixed-type or '
                                'non-consolidated DataFrame')
            for blk in self.blocks:
                newb = make_block(blk.values[slicer], blk.items, blk.ref_items)
                new_blocks.append(newb)
        elif len(self.blocks) == 1:
            vals = self.blocks[0].values[slicer]
            if copy:
                vals = vals.copy()
            new_blocks = [make_block(vals, self.items, self.items)]

        return BlockManager(new_blocks, new_axes)
    def fast_2d_xs(self, loc, copy=False):
        """
        Return the cross-section at integer column *loc* as a 1-D ndarray.
        """
        if len(self.blocks) == 1:
            result = self.blocks[0].values[:, loc]
            if copy:
                result = result.copy()
            return result

        if not copy:
            raise Exception('cannot get view of mixed-type or '
                            'non-consolidated DataFrame')

        dtype = _interleaved_dtype(self.blocks)

        items = self.items
        n = len(items)
        result = np.empty(n, dtype=dtype)
        # gather each item's value from whichever block owns it
        for blk in self.blocks:
            values = blk.values
            for j, item in enumerate(blk.items):
                i = items.get_loc(item)
                result[i] = values[j, loc]

        return result
  648. def consolidate(self):
  649. """
  650. Join together blocks having same dtype
  651. Returns
  652. -------
  653. y : BlockManager
  654. """
  655. if self.is_consolidated():
  656. return self
  657. new_blocks = _consolidate(self.blocks, self.items)
  658. return BlockManager(new_blocks, self.axes)
  659. def _consolidate_inplace(self):
  660. self.blocks = _consolidate(self.blocks, self.items)
    def get(self, item):
        """Return the values stored under *item*."""
        _, block = self._find_block(item)
        return block.get(item)

    def iget(self, i):
        """Return the values for the item at integer position *i*."""
        item = self.items[i]
        if self.items.is_unique:
            return self.get(item)
        else:
            # ugh
            try:
                inds, = (self.items == item).nonzero()
            except AttributeError:  # MultiIndex
                inds, = self.items.map(lambda x: x == item).nonzero()

            _, block = self._find_block(item)

            try:
                binds, = (block.items == item).nonzero()
            except AttributeError:  # MultiIndex
                binds, = block.items.map(lambda x: x == item).nonzero()

            # pair the i-th global occurrence with its block-local position;
            # only works when all duplicates live in the same block
            for j, (k, b) in enumerate(zip(inds, binds)):
                if i == k:
                    return block.values[b]

            raise Exception('Cannot have duplicate column names '
                            'split across dtypes')
    def get_scalar(self, tup):
        """
        Retrieve single item
        """
        item = tup[0]
        _, blk = self._find_block(item)

        # this could obviously be seriously sped up in cython
        # NOTE: the trailing comma deliberately makes item_loc a 1-tuple
        item_loc = blk.items.get_loc(item),
        full_loc = item_loc + tuple(ax.get_loc(x)
                                    for ax, x in zip(self.axes[1:], tup[1:]))
        return blk.values[full_loc]
  695. def delete(self, item):
  696. i, _ = self._find_block(item)
  697. loc = self.items.get_loc(item)
  698. new_items = self.items.delete(loc)
  699. self._delete_from_block(i, item)
  700. self.set_items_norename(new_items)
    def set(self, item, value):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        """
        # promote an (n-1)-dim value to a single-row block of n dims
        if value.ndim == self.ndim - 1:
            value = value.reshape((1,) + value.shape)

        assert(value.shape[1:] == self.shape[1:])
        if item in self.items:
            i, block = self._find_block(item)
            if not block.should_store(value):
                # delete from block, create and append new block
                self._delete_from_block(i, item)
                self._add_new_block(item, value, loc=None)
            else:
                block.set(item, value)
        else:
            # insert at end
            self.insert(len(self.items), item, value)
    def insert(self, loc, item, value):
        """Insert *item* with *value* at position *loc*; must be new."""
        if item in self.items:
            raise Exception('cannot insert %s, already exists' % item)

        new_items = self.items.insert(loc, item)
        self.set_items_norename(new_items)

        # new block
        self._add_new_block(item, value, loc=loc)

        # fragmentation guard: too many small blocks degrades performance
        if len(self.blocks) > 100:
            self._consolidate_inplace()
  729. def set_items_norename(self, value):
  730. value = _ensure_index(value)
  731. self.axes[0] = value
  732. for block in self.blocks:
  733. block.set_ref_items(value, maybe_rename=False)
  734. def _delete_from_block(self, i, item):
  735. """
  736. Delete and maybe remove the whole block
  737. """
  738. block = self.blocks.pop(i)
  739. new_left, new_right = block.split_block_at(item)
  740. if new_left is not None:
  741. self.blocks.append(new_left)
  742. if new_right is not None:
  743. self.blocks.append(new_right)
    def _add_new_block(self, item, value, loc=None):
        # Do we care about dtype at the moment?

        # hm, elaborate hack?
        if loc is None:
            loc = self.items.get_loc(item)
        # .copy() so the new block owns its labels independently of self.items
        new_block = make_block(value, self.items[loc:loc+1].copy(),
                               self.items)
        self.blocks.append(new_block)
    def _find_block(self, item):
        # return (position, block) for the block owning *item*;
        # _check_have guarantees the loop finds a match
        self._check_have(item)
        for i, block in enumerate(self.blocks):
            if item in block:
                return i, block

    def _check_have(self, item):
        # raise KeyError for unknown items
        if item not in self.items:
            raise KeyError('no item named %s' % str(item))
    def reindex_axis(self, new_axis, method=None, axis=0, copy=True):
        """Conform this manager to *new_axis* along *axis*."""
        new_axis = _ensure_index(new_axis)
        cur_axis = self.axes[axis]

        if new_axis.equals(cur_axis):
            # labels unchanged: just copy (or return self when copy=False)
            if copy:
                result = self.copy(deep=True)
                result.axes[axis] = new_axis

                if axis == 0:
                    # patch ref_items, #1823
                    for blk in result.blocks:
                        blk.ref_items = new_axis

                return result
            else:
                return self

        if axis == 0:
            # item reindexing has its own path; fill method unsupported there
            assert(method is None)
            return self.reindex_items(new_axis)

        new_axis, indexer = cur_axis.reindex(new_axis, method)
        return self.reindex_indexer(new_axis, indexer, axis=axis)
    def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=np.nan):
        """
        pandas-indexer with -1's only.
        """
        if axis == 0:
            return self._reindex_indexer_items(new_axis, indexer, fill_value)

        # -1 marks labels absent from the current axis
        mask = indexer == -1

        # TODO: deal with length-0 case? or does it fall out?
        needs_masking = len(new_axis) > 0 and mask.any()

        new_blocks = []
        for block in self.blocks:
            newb = block.reindex_axis(indexer, mask, needs_masking,
                                      axis=axis, fill_value=fill_value)
            new_blocks.append(newb)

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        return BlockManager(new_blocks, new_axes)
  796. def _reindex_indexer_items(self, new_items, indexer, fill_value):
  797. # TODO: less efficient than I'd like
  798. item_order = com.take_1d(self.items.values, indexer)
  799. # keep track of what items aren't found anywhere
  800. mask = np.zeros(len(item_order), dtype=bool)
  801. new_blocks = []
  802. for blk in self.blocks:
  803. blk_indexer = blk.items.get_indexer(item_order)
  804. selector = blk_indexer != -1
  805. # update with observed items
  806. mask |= selector
  807. if not selector.any():
  808. continue
  809. new_block_items = new_items.take(selector.nonzero()[0])
  810. new_values = com.take_fast(blk.values, blk_indexer[selector],
  811. None, False, axis=0)
  812. new_blocks.append(make_block(new_values, new_block_items,
  813. new_items))
  814. if not mask.all():
  815. na_items = new_items[-mask]
  816. na_block = self._make_na_block(na_items, new_items,
  817. fill_value=fill_value)
  818. new_blocks.append(na_block)
  819. new_blocks = _consolidate(new_blocks, new_items)
  820. return BlockManager(new_blocks, [new_items] + self.axes[1:])
  821. def reindex_items(self, new_items, copy=True, fill_value=np.nan):
  822. """
  823. """
  824. new_items = _ensure_index(new_items)
  825. data = self
  826. if not data.is_consolidated():
  827. data = data.consolidate()
  828. return data.reindex_items(new_items)
  829. # TODO: this part could be faster (!)
  830. new_items, indexer = self.items.reindex(new_items)
  831. # could have some pathological (MultiIndex) issues here
  832. new_blocks = []
  833. if indexer is None:
  834. for blk in self.blocks:
  835. if copy:
  836. new_blocks.append(blk.reindex_items_from(new_items))
  837. else:
  838. blk.ref_items = new_items
  839. new_blocks.append(blk)
  840. else:
  841. for block in self.blocks:
  842. newb = block.reindex_items_from(new_items, copy=copy)
  843. if len(newb.items) > 0:
  844. new_blocks.append(newb)
  845. mask = indexer == -1
  846. if mask.any():
  847. extra_items = new_items[mask]
  848. na_block = self._make_na_block(extra_items, new_items,
  849. fill_value=fill_value)
  850. new_blocks.append(na_block)
  851. new_blocks = _consolidate(new_blocks, new_items)
  852. return BlockManager(new_blocks, [new_items] + self.axes[1:])
  853. def _make_na_block(self, items, ref_items, fill_value=np.nan):
  854. # TODO: infer dtypes other than float64 from fill_value
  855. block_shape = list(self.shape)
  856. block_shape[0] = len(items)
  857. dtype = com._infer_dtype(fill_value)
  858. block_values = np.empty(block_shape, dtype=dtype)
  859. block_values.fill(fill_value)
  860. na_block = make_block(block_values, items, ref_items,
  861. do_integrity_check=True)
  862. return na_block
  863. def take(self, indexer, axis=1):
  864. if axis == 0:
  865. raise NotImplementedError
  866. indexer = np.asarray(indexer, dtype='i4')
  867. n = len(self.axes[axis])
  868. if ((indexer == -1) | (indexer >= n)).any():
  869. raise Exception('Indices must be nonzero and less than '
  870. 'the axis length')
  871. new_axes = list(self.axes)
  872. new_axes[axis] = self.axes[axis].take(indexer)
  873. new_blocks = []
  874. for blk in self.blocks:
  875. new_values = com.take_fast(blk.values, indexer,
  876. None, False, axis=axis)
  877. newb = make_block(new_values, blk.items, self.items)
  878. new_blocks.append(newb)
  879. return BlockManager(new_blocks, new_axes)
  880. def merge(self, other, lsuffix=None, rsuffix=None):
  881. assert(self._is_indexed_like(other))
  882. this, other = self._maybe_rename_join(other, lsuffix, rsuffix)
  883. cons_items = this.items + other.items
  884. consolidated = _consolidate(this.blocks + other.blocks, cons_items)
  885. new_axes = list(this.axes)
  886. new_axes[0] = cons_items
  887. return BlockManager(consolidated, new_axes)
  888. def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
  889. to_rename = self.items.intersection(other.items)
  890. if len(to_rename) > 0:
  891. if not lsuffix and not rsuffix:
  892. raise Exception('columns overlap: %s' % to_rename)
  893. def lrenamer(x):
  894. if x in to_rename:
  895. return '%s%s' % (x, lsuffix)
  896. return x
  897. def rrenamer(x):
  898. if x in to_rename:
  899. return '%s%s' % (x, rsuffix)
  900. return x
  901. this = self.rename_items(lrenamer, copydata=copydata)
  902. other = other.rename_items(rrenamer, copydata=copydata)
  903. else:
  904. this = self
  905. return this, other
  906. def _is_indexed_like(self, other):
  907. """
  908. Check all axes except items
  909. """
  910. assert(self.ndim == other.ndim)
  911. for ax, oax in zip(self.axes[1:], other.axes[1:]):
  912. if not ax.equals(oax):
  913. return False
  914. return True
  915. def rename_axis(self, mapper, axis=1):
  916. new_axis = Index([mapper(x) for x in self.axes[axis]])
  917. assert(new_axis.is_unique)
  918. new_axes = list(self.axes)
  919. new_axes[axis] = new_axis
  920. return BlockManager(self.blocks, new_axes)
  921. def rename_items(self, mapper, copydata=True):
  922. new_items = Index([mapper(x) for x in self.items])
  923. new_items.is_unique
  924. new_blocks = []
  925. for block in self.blocks:
  926. newb = block.copy(deep=copydata)
  927. newb.set_ref_items(new_items, maybe_rename=True)
  928. new_blocks.append(newb)
  929. new_axes = list(self.axes)
  930. new_axes[0] = new_items
  931. return BlockManager(new_blocks, new_axes)
  932. def add_prefix(self, prefix):
  933. f = (('%s' % prefix) + '%s').__mod__
  934. return self.rename_items(f)
  935. def add_suffix(self, suffix):
  936. f = ('%s' + ('%s' % suffix)).__mod__
  937. return self.rename_items(f)
  938. def fillna(self, value, inplace=False):
  939. new_blocks = [b.fillna(value, inplace=inplace)
  940. if b._can_hold_na else b
  941. for b in self.blocks]
  942. if inplace:
  943. return self
  944. return BlockManager(new_blocks, self.axes)
  945. def replace(self, to_replace, value, inplace=False):
  946. new_blocks = [b.replace(to_replace, value, inplace=inplace)
  947. for b in self.blocks]
  948. if inplace:
  949. return self
  950. return BlockManager(new_blocks, self.axes)
  951. def _replace_list(self, src_lst, dest_lst):
  952. sset = set(src_lst)
  953. if any([k in sset for k in dest_lst]):
  954. masks = {}
  955. for s in src_lst:
  956. masks[s] = [b.values == s for b in self.blocks]
  957. for s, d in zip(src_lst, dest_lst):
  958. [b.putmask(masks[s][i], d, inplace=True) for i, b in
  959. enumerate(self.blocks)]
  960. else:
  961. for s, d in zip(src_lst, dest_lst):
  962. self.replace(s, d, inplace=True)
  963. return self
  964. @property
  965. def block_id_vector(self):
  966. # TODO
  967. result = np.empty(len(self.items), dtype=int)
  968. result.fill(-1)
  969. for i, blk in enumerate(self.blocks):
  970. indexer = self.items.get_indexer(blk.items)
  971. assert((indexer != -1).all())
  972. result.put(indexer, i)
  973. assert((result >= 0).all())
  974. return result
  975. @property
  976. def item_dtypes(self):
  977. result = np.empty(len(self.items), dtype='O')
  978. mask = np.zeros(len(self.items), dtype=bool)
  979. for i, blk in enumerate(self.blocks):
  980. indexer = self.items.get_indexer(blk.items)
  981. result.put(indexer, blk.values.dtype.name)
  982. mask.put(indexer, 1)
  983. assert(mask.all())
  984. return result
  985. def form_blocks(data, axes):
  986. # pre-filter out items if we passed it
  987. items = axes[0]
  988. if len(data) < len(items):
  989. extra_items = items - Index(data.keys())
  990. else:
  991. extra_items = []
  992. # put "leftover" items in float bucket, where else?
  993. # generalize?
  994. float_dict = {}
  995. complex_dict = {}
  996. int_dict = {}
  997. bool_dict = {}
  998. object_dict = {}
  999. datetime_dict = {}
  1000. for k, v in data.iteritems():
  1001. if issubclass(v.dtype.type, np.floating):
  1002. float_dict[k] = v
  1003. elif issubclass(v.dtype.type, np.complexfloating):
  1004. complex_dict[k] = v
  1005. elif issubclass(v.dtype.type, np.datetime64):
  1006. datetime_dict[k] = v
  1007. elif issubclass(v.dtype.type, np.integer):
  1008. int_dict[k] = v
  1009. elif v.dtype == np.bool_:
  1010. bool_dict[k] = v
  1011. else:
  1012. object_dict[k] = v
  1013. blocks = []
  1014. if len(float_dict):
  1015. float_block = _simple_blockify(float_dict, items, np.float64)
  1016. blocks.append(float_block)
  1017. if len(complex_dict):
  1018. complex_block = _simple_blockify(complex_dict, items, np.complex128)
  1019. blocks.append(complex_block)
  1020. if len(int_dict):
  1021. int_block = _simple_blockify(int_dict, items, np.int64)
  1022. blocks.append(int_block)
  1023. for k, v in list(datetime_dict.items()):
  1024. # hackeroo
  1025. if hasattr(v, 'tz') and v.tz is not None:
  1026. del datetime_dict[k]
  1027. object_dict[k] = v.asobject
  1028. if len(datetime_dict):
  1029. datetime_block = _simple_blockify(datetime_dict, items,
  1030. np.dtype('M8[ns]'))
  1031. blocks.append(datetime_block)
  1032. if len(bool_dict):
  1033. bool_block = _simple_blockify(bool_dict, items, np.bool_)
  1034. blocks.append(bool_block)
  1035. if len(object_dict) > 0:
  1036. object_block = _simple_blockify(object_dict, items, np.object_)
  1037. blocks.append(object_block)
  1038. if len(extra_items):
  1039. shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])
  1040. # empty items -> dtype object
  1041. block_values = np.empty(shape, dtype=object)
  1042. block_values.fill(nan)
  1043. na_block = make_block(block_values, extra_items, items,
  1044. do_integrity_check=True)
  1045. blocks.append(na_block)
  1046. blocks = _consolidate(blocks, items)
  1047. return blocks
  1048. def _simple_blockify(dct, ref_items, dtype):
  1049. block_items, values = _stack_dict(dct, ref_items, dtype)
  1050. # CHECK DTYPE?
  1051. if values.dtype != dtype: # pragma: no cover
  1052. values = values.astype(dtype)
  1053. return make_block(values, block_items, ref_items, do_integrity_check=True)
  1054. def _stack_dict(dct, ref_items, dtype):
  1055. from pandas.core.series import Series
  1056. # fml
  1057. def _asarray_compat(x):
  1058. # asarray shouldn't be called on SparseSeries
  1059. if isinstance(x, Series):
  1060. return x.values
  1061. else:
  1062. return np.asarray(x)
  1063. def _shape_compat(x):
  1064. # sparseseries
  1065. if isinstance(x, Series):
  1066. return len(x),
  1067. else:
  1068. return x.shape
  1069. # index may box values
  1070. items = ref_items[[x in dct for x in ref_items]]
  1071. first = dct[items[0]]
  1072. shape = (len(dct),) + _shape_compat(first)
  1073. stacked = np.empty(shape, dtype=dtype)
  1074. for i, item in enumerate(items):
  1075. stacked[i] = _asarray_compat(dct[item])
  1076. # stacked = np.vstack([_asarray_compat(dct[k]) for k in items])
  1077. return items, stacked
  1078. def _blocks_to_series_dict(blocks, index=None):
  1079. from pandas.core.series import Series
  1080. series_dict = {}
  1081. for block in blocks:
  1082. for item, vec in zip(block.items, block.values):
  1083. series_dict[item] = Series(vec, index=index, name=item)
  1084. return series_dict
  1085. def _interleaved_dtype(blocks):
  1086. from collections import defaultdict
  1087. counts = defaultdict(lambda: 0)
  1088. for x in blocks:
  1089. counts[type(x)] += 1
  1090. have_int = counts[IntBlock] > 0
  1091. have_bool = counts[BoolBlock] > 0
  1092. have_object = counts[ObjectBlock] > 0
  1093. have_float = counts[FloatBlock] > 0
  1094. have_complex = counts[ComplexBlock] > 0
  1095. have_dt64 = counts[DatetimeBlock] > 0
  1096. have_numeric = have_float or have_complex or have_int
  1097. if (have_object or
  1098. (have_bool and have_numeric) or
  1099. (have_numeric and have_dt64)):
  1100. return np.dtype(object)
  1101. elif have_bool:
  1102. return np.dtype(bool)
  1103. elif have_int and not have_float and not have_complex:
  1104. return np.dtype('i8')
  1105. elif have_dt64 and not have_float and not have_complex:
  1106. return np.dtype('M8[ns]')
  1107. elif have_complex:
  1108. return np.dtype('c16')
  1109. else:
  1110. return np.dtype('f8')
  1111. def _consolidate(blocks, items):
  1112. """
  1113. Merge blocks having same dtype
  1114. """
  1115. get_dtype = lambda x: x.dtype.name
  1116. # sort by dtype
  1117. grouper = itertools.groupby(sorted(blocks, key=get_dtype),
  1118. lambda x: x.dtype)
  1119. new_blocks = []
  1120. for dtype, group_blocks in grouper:
  1121. new_block = _merge_blocks(list(group_blocks), items)
  1122. new_blocks.append(new_block)
  1123. return new_blocks
  1124. # TODO: this could be much optimized
  1125. def _merge_blocks(blocks, items):
  1126. if len(blocks) == 1:
  1127. return blocks[0]
  1128. new_values = _vstack([b.values for b in blocks])
  1129. new_items = blocks[0].items.append([b.items for b in blocks[1:]])
  1130. new_block = make_block(new_values, new_items, items,
  1131. do_integrity_check=True)
  1132. return new_block.reindex_items_from(items)
  1133. def _union_block_items(blocks):
  1134. tot_len = 0
  1135. all_items = []
  1136. slow = False
  1137. for b in blocks:
  1138. tot_len += len(b.items)
  1139. if type(b.items) != Index:
  1140. slow = True
  1141. all_items.append(b.items)
  1142. if slow:
  1143. the_union = _union_items_slow(all_items)
  1144. else:
  1145. the_union = Index(lib.fast_unique_multiple(all_items))
  1146. if tot_len > len(the_union):
  1147. raise Exception('item names overlap')
  1148. return the_union
  1149. def _union_items_slow(all_items):
  1150. seen = None
  1151. for items in all_items:
  1152. if seen is None:
  1153. seen = items
  1154. else:
  1155. seen = seen.union(items)
  1156. return seen
  1157. def _vstack(to_stack):
  1158. if all(x.dtype == _NS_DTYPE for x in to_stack):
  1159. # work around NumPy 1.6 bug
  1160. new_values = np.vstack([x.view('i8') for x in to_stack])
  1161. return new_values.view(_NS_DTYPE)
  1162. else:
  1163. return np.vstack(to_stack)