PageRenderTime 78ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/core/internals.py

https://github.com/ajcr/pandas
Python | 4069 lines | 3422 code | 325 blank | 322 comment | 311 complexity | ea364a4280ea861ebe94fc70579f020b MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. import copy
  2. import itertools
  3. import re
  4. import operator
  5. from datetime import datetime, timedelta
  6. from collections import defaultdict
  7. import numpy as np
  8. from pandas.core.base import PandasObject
  9. from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
  10. _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
  11. ABCSparseSeries, _infer_dtype_from_scalar,
  12. _is_null_datelike_scalar,
  13. is_timedelta64_dtype, is_datetime64_dtype,
  14. _possibly_infer_to_datetimelike)
  15. from pandas.core.index import Index, MultiIndex, _ensure_index
  16. from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer)
  17. import pandas.core.common as com
  18. from pandas.sparse.array import _maybe_to_sparse, SparseArray
  19. import pandas.lib as lib
  20. import pandas.tslib as tslib
  21. import pandas.computation.expressions as expressions
  22. from pandas.util.decorators import cache_readonly
  23. from pandas.tslib import Timestamp
  24. from pandas import compat
  25. from pandas.compat import range, map, zip, u
  26. from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
  27. from pandas.lib import BlockPlacement
  class Block(PandasObject):
      """
      Canonical n-dimensional unit of homogeneous dtype contained in a pandas
      data structure

      Index-ignorant; let the container take care of that
      """
      # per-instance storage: manager placement, the data, and its rank
      __slots__ = ['_mgr_locs', 'values', 'ndim']

      # kind flags -- all off on the base class, subclasses switch theirs on
      is_numeric = False
      is_float = False
      is_integer = False
      is_complex = False
      is_datetime = False
      is_timedelta = False
      is_bool = False
      is_object = False
      is_sparse = False

      # behavioural defaults read by the methods of this class
      _can_hold_na = False
      _downcast_dtype = None
      _can_consolidate = True
      _verify_integrity = True
      _ftype = 'dense'
  49. def __init__(self, values, placement, ndim=None, fastpath=False):
  50. if ndim is None:
  51. ndim = values.ndim
  52. elif values.ndim != ndim:
  53. raise ValueError('Wrong number of dimensions')
  54. self.ndim = ndim
  55. self.mgr_locs = placement
  56. self.values = values
  57. if len(self.mgr_locs) != len(self.values):
  58. raise ValueError('Wrong number of items passed %d,'
  59. ' placement implies %d' % (
  60. len(self.values), len(self.mgr_locs)))
  61. @property
  62. def _consolidate_key(self):
  63. return (self._can_consolidate, self.dtype.name)
  64. @property
  65. def _is_single_block(self):
  66. return self.ndim == 1
  67. @property
  68. def is_datelike(self):
  69. """ return True if I am a non-datelike """
  70. return self.is_datetime or self.is_timedelta
  71. @property
  72. def fill_value(self):
  73. return np.nan
  74. @property
  75. def mgr_locs(self):
  76. return self._mgr_locs
  def make_block_same_class(self, values, placement, copy=False,
                            **kwargs):
      """
      Wrap given values in a block of same type as self.

      ``values`` is copied first when ``copy`` is true.  `kwargs` are not
      read here; they are used in SparseBlock override.
      """
      payload = values.copy() if copy else values
      block_type = self.__class__
      return make_block(payload, placement, klass=block_type,
                        fastpath=True)
  @mgr_locs.setter
  def mgr_locs(self, new_mgr_locs):
      """ store the placement, wrapping it in BlockPlacement if needed """
      already_wrapped = isinstance(new_mgr_locs, BlockPlacement)
      self._mgr_locs = (new_mgr_locs if already_wrapped
                        else BlockPlacement(new_mgr_locs))
  def __unicode__(self):
      """ short textual summary: class name, size/placement and dtype """
      # don't want to print out all of the items here
      name = com.pprint_thing(self.__class__.__name__)

      if self._is_single_block:
          return '%s: %s dtype: %s' % (name, len(self), self.dtype)

      dims = [com.pprint_thing(extent) for extent in self.shape]
      shape = ' x '.join(dims)
      locs = com.pprint_thing(self.mgr_locs.indexer)
      return '%s: %s, %s, dtype: %s' % (name, locs, shape, self.dtype)
  104. def __len__(self):
  105. return len(self.values)
  106. def __getstate__(self):
  107. return self.mgr_locs.indexer, self.values
  def __setstate__(self, state):
      """ restore from ``(indexer, values)``; ndim is taken from the values """
      self.mgr_locs = BlockPlacement(state[0])
      restored = state[1]
      self.values = restored
      self.ndim = restored.ndim
  112. def _slice(self, slicer):
  113. """ return a slice of my values """
  114. return self.values[slicer]
  115. def getitem_block(self, slicer, new_mgr_locs=None):
  116. """
  117. Perform __getitem__-like, return result as block.
  118. As of now, only supports slices that preserve dimensionality.
  119. """
  120. if new_mgr_locs is None:
  121. if isinstance(slicer, tuple):
  122. axis0_slicer = slicer[0]
  123. else:
  124. axis0_slicer = slicer
  125. new_mgr_locs = self.mgr_locs[axis0_slicer]
  126. new_values = self._slice(slicer)
  127. if new_values.ndim != self.ndim:
  128. raise ValueError("Only same dim slicing is allowed")
  129. return self.make_block_same_class(new_values, new_mgr_locs)
  130. @property
  131. def shape(self):
  132. return self.values.shape
  133. @property
  134. def itemsize(self):
  135. return self.values.itemsize
  136. @property
  137. def dtype(self):
  138. return self.values.dtype
  139. @property
  140. def ftype(self):
  141. return "%s:%s" % (self.dtype, self._ftype)
  def merge(self, other):
      """ hand ``[self, other]`` to _merge_blocks and return its result """
      pair = [self, other]
      return _merge_blocks(pair)
  144. def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
  145. limit=None, mask_info=None):
  146. """
  147. Reindex using pre-computed indexer information
  148. """
  149. if axis < 1:
  150. raise AssertionError('axis must be at least 1, got %d' % axis)
  151. if fill_value is None:
  152. fill_value = self.fill_value
  153. new_values = com.take_nd(self.values, indexer, axis,
  154. fill_value=fill_value, mask_info=mask_info)
  155. return make_block(new_values,
  156. ndim=self.ndim, fastpath=True,
  157. placement=self.mgr_locs)
  158. def get(self, item):
  159. loc = self.items.get_loc(item)
  160. return self.values[loc]
  161. def iget(self, i):
  162. return self.values[i]
  163. def set(self, locs, values, check=False):
  164. """
  165. Modify Block in-place with new item value
  166. Returns
  167. -------
  168. None
  169. """
  170. self.values[locs] = values
  171. def delete(self, loc):
  172. """
  173. Delete given loc(-s) from block in-place.
  174. """
  175. self.values = np.delete(self.values, loc, 0)
  176. self.mgr_locs = self.mgr_locs.delete(loc)
  177. def apply(self, func, **kwargs):
  178. """ apply the function to my values; return a block if we are not one """
  179. result = func(self.values)
  180. if not isinstance(result, Block):
  181. result = make_block(values=result, placement=self.mgr_locs,)
  182. return result
  183. def fillna(self, value, limit=None, inplace=False, downcast=None):
  184. if not self._can_hold_na:
  185. if inplace:
  186. return [self]
  187. else:
  188. return [self.copy()]
  189. mask = isnull(self.values)
  190. if limit is not None:
  191. if self.ndim > 2:
  192. raise NotImplementedError
  193. mask[mask.cumsum(self.ndim-1)>limit]=False
  194. value = self._try_fill(value)
  195. blocks = self.putmask(mask, value, inplace=inplace)
  196. return self._maybe_downcast(blocks, downcast)
  197. def _maybe_downcast(self, blocks, downcast=None):
  198. # no need to downcast our float
  199. # unless indicated
  200. if downcast is None and self.is_float:
  201. return blocks
  202. elif downcast is None and (self.is_timedelta or self.is_datetime):
  203. return blocks
  204. result_blocks = []
  205. for b in blocks:
  206. result_blocks.extend(b.downcast(downcast))
  207. return result_blocks
  208. def downcast(self, dtypes=None):
  209. """ try to downcast each item to the dict of dtypes if present """
  210. # turn it off completely
  211. if dtypes is False:
  212. return [self]
  213. values = self.values
  214. # single block handling
  215. if self._is_single_block:
  216. # try to cast all non-floats here
  217. if dtypes is None:
  218. dtypes = 'infer'
  219. nv = _possibly_downcast_to_dtype(values, dtypes)
  220. return [make_block(nv, ndim=self.ndim,
  221. fastpath=True, placement=self.mgr_locs)]
  222. # ndim > 1
  223. if dtypes is None:
  224. return [self]
  225. if not (dtypes == 'infer' or isinstance(dtypes, dict)):
  226. raise ValueError("downcast must have a dictionary or 'infer' as "
  227. "its argument")
  228. # item-by-item
  229. # this is expensive as it splits the blocks items-by-item
  230. blocks = []
  231. for i, rl in enumerate(self.mgr_locs):
  232. if dtypes == 'infer':
  233. dtype = 'infer'
  234. else:
  235. raise AssertionError("dtypes as dict is not supported yet")
  236. dtype = dtypes.get(item, self._downcast_dtype)
  237. if dtype is None:
  238. nv = _block_shape(values[i], ndim=self.ndim)
  239. else:
  240. nv = _possibly_downcast_to_dtype(values[i], dtype)
  241. nv = _block_shape(nv, ndim=self.ndim)
  242. blocks.append(make_block(nv,
  243. ndim=self.ndim, fastpath=True,
  244. placement=[rl]))
  245. return blocks
  246. def astype(self, dtype, copy=False, raise_on_error=True, values=None):
  247. return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
  248. values=values)
  249. def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
  250. klass=None):
  251. """
  252. Coerce to the new type (if copy=True, return a new copy)
  253. raise on an except if raise == True
  254. """
  255. dtype = np.dtype(dtype)
  256. if self.dtype == dtype:
  257. if copy:
  258. return self.copy()
  259. return self
  260. try:
  261. # force the copy here
  262. if values is None:
  263. # _astype_nansafe works fine with 1-d only
  264. values = com._astype_nansafe(self.values.ravel(), dtype, copy=True)
  265. values = values.reshape(self.values.shape)
  266. newb = make_block(values,
  267. ndim=self.ndim, placement=self.mgr_locs,
  268. fastpath=True, dtype=dtype, klass=klass)
  269. except:
  270. if raise_on_error is True:
  271. raise
  272. newb = self.copy() if copy else self
  273. if newb.is_numeric and self.is_numeric:
  274. if newb.shape != self.shape:
  275. raise TypeError("cannot set astype for copy = [%s] for dtype "
  276. "(%s [%s]) with smaller itemsize that current "
  277. "(%s [%s])" % (copy, self.dtype.name,
  278. self.itemsize, newb.dtype.name,
  279. newb.itemsize))
  280. return newb
  281. def convert(self, copy=True, **kwargs):
  282. """ attempt to coerce any object types to better types
  283. return a copy of the block (if copy = True)
  284. by definition we are not an ObjectBlock here! """
  285. return [self.copy()] if copy else [self]
  286. def _can_hold_element(self, value):
  287. raise NotImplementedError()
  288. def _try_cast(self, value):
  289. raise NotImplementedError()
  def _try_cast_result(self, result, dtype=None):
      """ try to cast the result to our original type,
      we may have roundtripped thru object in the mean-time

      ``dtype`` defaults to the block's own dtype. """
      if dtype is None:
          dtype = self.dtype

      skip_float_path = self.is_integer or self.is_bool or self.is_datetime
      if skip_float_path or not (self.is_float and
                                 result.dtype == self.dtype):
          # may need to change the dtype here
          return _possibly_downcast_to_dtype(result, dtype)

      # float block whose result kept the block dtype:
      # protect against a bool/object showing up here
      if isinstance(dtype, compat.string_types) and dtype == 'infer':
          return result

      if not isinstance(dtype, type):
          dtype = dtype.type

      if issubclass(dtype, np.bool_):
          if isnull(result).all():
              return result.astype(np.bool_)
          # go through object so 1/0 can be replaced by True/False
          result = result.astype(np.object_)
          result[result == 1] = True
          result[result == 0] = False
          return result

      if issubclass(dtype, np.object_):
          return result.astype(np.object_)

      return result
  317. def _try_operate(self, values):
  318. """ return a version to operate on as the input """
  319. return values
  320. def _try_coerce_args(self, values, other):
  321. """ provide coercion to our input arguments """
  322. return values, other
  323. def _try_coerce_result(self, result):
  324. """ reverse of try_coerce_args """
  325. return result
  326. def _try_coerce_and_cast_result(self, result, dtype=None):
  327. result = self._try_coerce_result(result)
  328. result = self._try_cast_result(result, dtype=dtype)
  329. return result
  330. def _try_fill(self, value):
  331. return value
  332. def to_native_types(self, slicer=None, na_rep='', **kwargs):
  333. """ convert to our native types format, slicing if desired """
  334. values = self.values
  335. if slicer is not None:
  336. values = values[:, slicer]
  337. values = np.array(values, dtype=object)
  338. mask = isnull(values)
  339. values[mask] = na_rep
  340. return values.tolist()
  341. # block actions ####
  def copy(self, deep=True):
      """ new block of my own class; the values are copied when ``deep`` """
      payload = self.values.copy() if deep else self.values
      return make_block(payload, ndim=self.ndim,
                        klass=self.__class__, fastpath=True,
                        placement=self.mgr_locs)
  def replace(self, to_replace, value, inplace=False, filter=None,
              regex=False):
      """ replace the to_replace value with value, possible to create new
      blocks here this is just a call to putmask. regex is not used here.
      It is used in ObjectBlocks.  It is here for API
      compatibility.

      When ``filter`` is given, locations not in it are left untouched.
      Returns a list of blocks. """
      hits = com.mask_missing(self.values, to_replace)

      if filter is not None:
          excluded = ~self.mgr_locs.isin(filter)
          hits[excluded.nonzero()[0]] = False

      if hits.any():
          return self.putmask(hits, value, inplace=inplace)

      # nothing matched
      return [self] if inplace else [self.copy()]
  def setitem(self, indexer, value):
      """ set the value inplace; return a new block (of a possibly different
      dtype)

      indexer is a direct slice/positional indexer; value must be a
      compatible shape

      Raises ValueError when a list-like/ndarray or slice indexer and a
      list-like value disagree in length.  ValueError and TypeError from
      the assignment itself propagate; any other exception there is
      swallowed and ``[self]`` is returned.
      """
      # coerce args
      values, value = self._try_coerce_args(self.values, value)
      arr_value = np.array(value)

      # cast the values to a type that can hold nan (if necessary)
      if not self._can_hold_element(value):
          promoted, _ = com._maybe_promote(arr_value.dtype)
          values = values.astype(promoted)

      if self.ndim == 2:
          transf = lambda x: x.T
      else:
          transf = lambda x: x
      values = transf(values)
      n_rows = len(values)

      # length checking
      # boolean with truth values == len of the value is ok too
      if isinstance(indexer, (np.ndarray, list)):
          if is_list_like(value) and len(indexer) != len(value):
              bool_match = (isinstance(indexer, np.ndarray) and
                            indexer.dtype == np.bool_ and
                            len(indexer[indexer]) == len(value))
              if not bool_match:
                  raise ValueError("cannot set using a list-like indexer "
                                   "with a different length than the value")

      # slice
      elif isinstance(indexer, slice):
          if is_list_like(value) and n_rows:
              if len(value) != _length_of_indexer(indexer, values):
                  raise ValueError("cannot set using a slice indexer with a "
                                   "different length than the value")

      try:
          # setting a single element for each dim and with a rhs that
          # could be say a list
          # GH 6043
          single_element = arr_value.ndim == 1 and (
              np.isscalar(indexer) or
              (isinstance(indexer, tuple) and
               all([np.isscalar(idx) for idx in indexer])))

          # if we are an exact match (ex-broadcasting),
          # then use the resultant dtype
          exact_match = (not single_element and
                         len(arr_value.shape) and
                         arr_value.shape[0] == values.shape[0] and
                         np.prod(arr_value.shape) == np.prod(values.shape))

          # set
          values[indexer] = value
          if exact_match:
              values = values.astype(arr_value.dtype)

          # coerce and try to infer the dtypes of the result
          if np.isscalar(value):
              dtype, _ = _infer_dtype_from_scalar(value)
          else:
              dtype = 'infer'
          values = self._try_coerce_and_cast_result(values, dtype)
          return [make_block(transf(values),
                             ndim=self.ndim, placement=self.mgr_locs,
                             fastpath=True)]
      except (ValueError, TypeError):
          raise
      except Exception:
          pass

      return [self]
  423. def putmask(self, mask, new, align=True, inplace=False):
  424. """ putmask the data to the block; it is possible that we may create a
  425. new dtype of block
  426. return the resulting block(s)
  427. Parameters
  428. ----------
  429. mask : the condition to respect
  430. new : a ndarray/object
  431. align : boolean, perform alignment on other/cond, default is True
  432. inplace : perform inplace modification, default is False
  433. Returns
  434. -------
  435. a new block(s), the result of the putmask
  436. """
  437. new_values = self.values if inplace else self.values.copy()
  438. # may need to align the new
  439. if hasattr(new, 'reindex_axis'):
  440. new = new.values.T
  441. # may need to align the mask
  442. if hasattr(mask, 'reindex_axis'):
  443. mask = mask.values.T
  444. # if we are passed a scalar None, convert it here
  445. if not is_list_like(new) and isnull(new):
  446. new = self.fill_value
  447. if self._can_hold_element(new):
  448. new = self._try_cast(new)
  449. # pseudo-broadcast
  450. if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1:
  451. new = np.repeat(new, self.shape[-1]).reshape(self.shape)
  452. np.putmask(new_values, mask, new)
  453. # maybe upcast me
  454. elif mask.any():
  455. # need to go column by column
  456. new_blocks = []
  457. if self.ndim > 1:
  458. for i, ref_loc in enumerate(self.mgr_locs):
  459. m = mask[i]
  460. v = new_values[i]
  461. # need a new block
  462. if m.any():
  463. n = new[i] if isinstance(
  464. new, np.ndarray) else np.array(new)
  465. # type of the new block
  466. dtype, _ = com._maybe_promote(n.dtype)
  467. # we need to exiplicty astype here to make a copy
  468. n = n.astype(dtype)
  469. nv = _putmask_smart(v, m, n)
  470. else:
  471. nv = v if inplace else v.copy()
  472. # Put back the dimension that was taken from it and make
  473. # a block out of the result.
  474. block = make_block(values=nv[np.newaxis],
  475. placement=[ref_loc],
  476. fastpath=True)
  477. new_blocks.append(block)
  478. else:
  479. nv = _putmask_smart(new_values, mask, new)
  480. new_blocks.append(make_block(values=nv,
  481. placement=self.mgr_locs,
  482. fastpath=True))
  483. return new_blocks
  484. if inplace:
  485. return [self]
  486. return [make_block(new_values,
  487. placement=self.mgr_locs, fastpath=True)]
  def interpolate(self, method='pad', axis=0, index=None,
                  values=None, inplace=False, limit=None,
                  fill_value=None, coerce=False, downcast=None, **kwargs):
      """
      Dispatch to a fill-style or an interpolation-style routine.

      ``method`` is first tried as a fill method
      (``com._clean_fill_method``) and then as an interpolation method
      (``com._clean_interp_method``); the first that is recognised
      decides whether ``_interpolate_with_fill`` or ``_interpolate`` runs.
      Bool and integer blocks that are not timedelta blocks are returned
      as they are (self when ``inplace``, else a copy).

      Raises
      ------
      ValueError
          if ``method`` is recognised by neither cleaner
      """

      def check_int_bool(self, inplace):
          # Only FloatBlocks will contain NaNs.
          # timedelta subclasses IntBlock
          if (self.is_bool or self.is_integer) and not self.is_timedelta:
              if inplace:
                  return self
              else:
                  return self.copy()

      # a fill na type method
      try:
          m = com._clean_fill_method(method)
      except Exception:
          # was a bare ``except``; do not trap KeyboardInterrupt/SystemExit
          m = None

      if m is not None:
          r = check_int_bool(self, inplace)
          if r is not None:
              return r
          return self._interpolate_with_fill(method=m,
                                             axis=axis,
                                             inplace=inplace,
                                             limit=limit,
                                             fill_value=fill_value,
                                             coerce=coerce,
                                             downcast=downcast)

      # try an interp method
      try:
          m = com._clean_interp_method(method, **kwargs)
      except Exception:
          m = None

      if m is not None:
          r = check_int_bool(self, inplace)
          if r is not None:
              return r
          return self._interpolate(method=m,
                                   index=index,
                                   values=values,
                                   axis=axis,
                                   limit=limit,
                                   fill_value=fill_value,
                                   inplace=inplace,
                                   downcast=downcast,
                                   **kwargs)

      raise ValueError("invalid method '{0}' to interpolate.".format(method))
  534. def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
  535. limit=None, fill_value=None, coerce=False,
  536. downcast=None):
  537. """ fillna but using the interpolate machinery """
  538. # if we are coercing, then don't force the conversion
  539. # if the block can't hold the type
  540. if coerce:
  541. if not self._can_hold_na:
  542. if inplace:
  543. return [self]
  544. else:
  545. return [self.copy()]
  546. fill_value = self._try_fill(fill_value)
  547. values = self.values if inplace else self.values.copy()
  548. values = self._try_operate(values)
  549. values = com.interpolate_2d(values,
  550. method=method,
  551. axis=axis,
  552. limit=limit,
  553. fill_value=fill_value,
  554. dtype=self.dtype)
  555. values = self._try_coerce_result(values)
  556. blocks = [make_block(values,
  557. ndim=self.ndim, klass=self.__class__,
  558. fastpath=True, placement=self.mgr_locs)]
  559. return self._maybe_downcast(blocks, downcast)
  560. def _interpolate(self, method=None, index=None, values=None,
  561. fill_value=None, axis=0, limit=None,
  562. inplace=False, downcast=None, **kwargs):
  563. """ interpolate using scipy wrappers """
  564. data = self.values if inplace else self.values.copy()
  565. # only deal with floats
  566. if not self.is_float:
  567. if not self.is_integer:
  568. return self
  569. data = data.astype(np.float64)
  570. if fill_value is None:
  571. fill_value = self.fill_value
  572. if method in ('krogh', 'piecewise_polynomial', 'pchip'):
  573. if not index.is_monotonic:
  574. raise ValueError("{0} interpolation requires that the "
  575. "index be monotonic.".format(method))
  576. # process 1-d slices in the axis direction
  577. def func(x):
  578. # process a 1-d slice, returning it
  579. # should the axis argument be handled below in apply_along_axis?
  580. # i.e. not an arg to com.interpolate_1d
  581. return com.interpolate_1d(index, x, method=method, limit=limit,
  582. fill_value=fill_value,
  583. bounds_error=False, **kwargs)
  584. # interp each column independently
  585. interp_values = np.apply_along_axis(func, axis, data)
  586. blocks = [make_block(interp_values,
  587. ndim=self.ndim, klass=self.__class__,
  588. fastpath=True, placement=self.mgr_locs)]
  589. return self._maybe_downcast(blocks, downcast)
  def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
      """
      Take values according to indexer and return them as a block.

      Parameters
      ----------
      indexer : positions to take
      axis : axis to take along
      new_mgr_locs : placement for the result; when None it is derived
          from the current placement (indexed by ``indexer`` on axis 0,
          unchanged on any other axis)
      fill_tuple : when given, filling is allowed and ``fill_tuple[0]``
          is the fill value; when None filling is disabled

      Returns
      -------
      a block of my own class when the dtype is unchanged, otherwise
      whatever ``make_block`` picks for the new values
      """
      if fill_tuple is None:
          new_values = com.take_nd(self.get_values(), indexer, axis=axis,
                                   allow_fill=False)
      else:
          new_values = com.take_nd(self.get_values(), indexer, axis=axis,
                                   allow_fill=True,
                                   fill_value=fill_tuple[0])

      if new_mgr_locs is None:
          if axis == 0:
              # prefer a slice over the raw indexer when one is equivalent
              slc = lib.indexer_as_slice(indexer)
              key = slc if slc is not None else indexer
              new_mgr_locs = self.mgr_locs[key]
          else:
              new_mgr_locs = self.mgr_locs

      if new_values.dtype != self.dtype:
          return make_block(new_values, new_mgr_locs)
      return self.make_block_same_class(new_values, new_mgr_locs)
  615. def get_values(self, dtype=None):
  616. return self.values
  def diff(self, n):
      """ return block for the diff of the values

      The difference is taken with ``com.diff`` along axis 1; the result
      is a one-element list. """
      differenced = com.diff(self.values, n, axis=1)
      result = make_block(values=differenced,
                          ndim=self.ndim, fastpath=True,
                          placement=self.mgr_locs)
      return [result]
  def shift(self, periods, axis=0):
      """ shift the block by periods, possibly upcast

      The vacated positions are set to the fill value returned by
      ``com._maybe_upcast``.  A shift of zero periods vacates nothing, so
      no position is overwritten.  Returns a one-element list.
      """
      # convert integer to float if necessary. need to do a lot more than
      # that, handle boolean etc also
      new_values, fill_value = com._maybe_upcast(self.values)

      # make sure array sent to np.roll is c_contiguous
      f_ordered = new_values.flags.f_contiguous
      if f_ordered:
          new_values = new_values.T
          axis = new_values.ndim - axis - 1

      new_values = np.roll(new_values, periods, axis=axis)

      # periods == 0 used to fall into the ``else`` arm and build
      # slice(0, None), blanking the whole block
      if periods != 0:
          axis_indexer = [slice(None)] * self.ndim
          if periods > 0:
              axis_indexer[axis] = slice(None, periods)
          else:
              axis_indexer[axis] = slice(periods, None)
          new_values[tuple(axis_indexer)] = fill_value

      # restore original order
      if f_ordered:
          new_values = new_values.T

      return [make_block(new_values,
                         ndim=self.ndim, fastpath=True,
                         placement=self.mgr_locs)]
  646. def eval(self, func, other, raise_on_error=True, try_cast=False):
  647. """
  648. evaluate the block; return result block from the result
  649. Parameters
  650. ----------
  651. func : how to combine self, other
  652. other : a ndarray/object
  653. raise_on_error : if True, raise when I can't perform the function,
  654. False by default (and just return the data that we had coming in)
  655. Returns
  656. -------
  657. a new block, the result of the func
  658. """
  659. values = self.values
  660. if hasattr(other, 'reindex_axis'):
  661. other = other.values
  662. # make sure that we can broadcast
  663. is_transposed = False
  664. if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
  665. if values.ndim != other.ndim:
  666. is_transposed = True
  667. else:
  668. if values.shape == other.shape[::-1]:
  669. is_transposed = True
  670. elif values.shape[0] == other.shape[-1]:
  671. is_transposed = True
  672. else:
  673. # this is a broadcast error heree
  674. raise ValueError("cannot broadcast shape [%s] with block "
  675. "values [%s]" % (values.T.shape,
  676. other.shape))
  677. transf = (lambda x: x.T) if is_transposed else (lambda x: x)
  678. # coerce/transpose the args if needed
  679. values, other = self._try_coerce_args(transf(values), other)
  680. # get the result, may need to transpose the other
  681. def get_result(other):
  682. return self._try_coerce_result(func(values, other))
  683. # error handler if we have an issue operating with the function
  684. def handle_error():
  685. if raise_on_error:
  686. raise TypeError('Could not operate %s with block values %s'
  687. % (repr(other), str(detail)))
  688. else:
  689. # return the values
  690. result = np.empty(values.shape, dtype='O')
  691. result.fill(np.nan)
  692. return result
  693. # get the result
  694. try:
  695. result = get_result(other)
  696. # if we have an invalid shape/broadcast error
  697. # GH4576, so raise instead of allowing to pass through
  698. except ValueError as detail:
  699. raise
  700. except Exception as detail:
  701. result = handle_error()
  702. # technically a broadcast error in numpy can 'work' by returning a
  703. # boolean False
  704. if not isinstance(result, np.ndarray):
  705. if not isinstance(result, np.ndarray):
  706. # differentiate between an invalid ndarray-ndarray comparison
  707. # and an invalid type comparison
  708. if isinstance(values, np.ndarray) and is_list_like(other):
  709. raise ValueError('Invalid broadcasting comparison [%s] '
  710. 'with block values' % repr(other))
  711. raise TypeError('Could not compare [%s] with block values'
  712. % repr(other))
  713. # transpose if needed
  714. result = transf(result)
  715. # try to cast if requested
  716. if try_cast:
  717. result = self._try_cast_result(result)
  718. return [make_block(result, ndim=self.ndim,
  719. fastpath=True, placement=self.mgr_locs)]
  720. def where(self, other, cond, align=True, raise_on_error=True,
  721. try_cast=False):
  722. """
  723. evaluate the block; return result block(s) from the result
  724. Parameters
  725. ----------
  726. other : a ndarray/object
  727. cond : the condition to respect
  728. align : boolean, perform alignment on other/cond
  729. raise_on_error : if True, raise when I can't perform the function,
  730. False by default (and just return the data that we had coming in)
  731. Returns
  732. -------
  733. a new block(s), the result of the func
  734. """
  735. values = self.values
  736. # see if we can align other
  737. if hasattr(other, 'reindex_axis'):
  738. other = other.values
  739. # make sure that we can broadcast
  740. is_transposed = False
  741. if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
  742. if values.ndim != other.ndim or values.shape == other.shape[::-1]:
  743. # if its symmetric are ok, no reshaping needed (GH 7506)
  744. if (values.shape[0] == np.array(values.shape)).all():
  745. pass
  746. # pseodo broadcast (its a 2d vs 1d say and where needs it in a
  747. # specific direction)
  748. elif (other.ndim >= 1 and values.ndim - 1 == other.ndim and
  749. values.shape[0] != other.shape[0]):
  750. other = _block_shape(other).T
  751. else:
  752. values = values.T
  753. is_transposed = True
  754. # see if we can align cond
  755. if not hasattr(cond, 'shape'):
  756. raise ValueError(
  757. "where must have a condition that is ndarray like")
  758. if hasattr(cond, 'reindex_axis'):
  759. cond = cond.values
  760. # may need to undo transpose of values
  761. if hasattr(values, 'ndim'):
  762. if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
  763. values = values.T
  764. is_transposed = not is_transposed
  765. # our where function
  766. def func(c, v, o):
  767. if c.ravel().all():
  768. return v
  769. v, o = self._try_coerce_args(v, o)
  770. try:
  771. return self._try_coerce_result(
  772. expressions.where(c, v, o, raise_on_error=True)
  773. )
  774. except Exception as detail:
  775. if raise_on_error:
  776. raise TypeError('Could not operate [%s] with block values '
  777. '[%s]' % (repr(o), str(detail)))
  778. else:
  779. # return the values
  780. result = np.empty(v.shape, dtype='float64')
  781. result.fill(np.nan)
  782. return result
  783. # see if we can operate on the entire block, or need item-by-item
  784. # or if we are a single block (ndim == 1)
  785. result = func(cond, values, other)
  786. if self._can_hold_na or self.ndim == 1:
  787. if not isinstance(result, np.ndarray):
  788. raise TypeError('Could not compare [%s] with block values'
  789. % repr(other))
  790. if is_transposed:
  791. result = result.T
  792. # try to cast if requested
  793. if try_cast:
  794. result = self._try_cast_result(result)
  795. return make_block(result,
  796. ndim=self.ndim, placement=self.mgr_locs)
  797. # might need to separate out blocks
  798. axis = cond.ndim - 1
  799. cond = cond.swapaxes(axis, 0)
  800. mask = np.array([cond[i].all() for i in range(cond.shape[0])],
  801. dtype=bool)
  802. result_blocks = []
  803. for m in [mask, ~mask]:
  804. if m.any():
  805. r = self._try_cast_result(
  806. result.take(m.nonzero()[0], axis=axis))
  807. result_blocks.append(make_block(r.T,
  808. placement=self.mgr_locs[m]))
  809. return result_blocks
  810. def equals(self, other):
  811. if self.dtype != other.dtype or self.shape != other.shape: return False
  812. return np.array_equal(self.values, other.values)
  class NumericBlock(Block):
      """ base for numeric blocks: flagged numeric and able to hold NA """
      __slots__ = ()

      # overrides of the Block defaults
      _can_hold_na = True
      is_numeric = True
  class FloatOrComplexBlock(NumericBlock):
      """ shared behaviour of float and complex blocks """
      __slots__ = ()

      def equals(self, other):
          """ equality in which NaNs at the same position compare equal """
          if self.dtype != other.dtype or self.shape != other.shape:
              return False

          mine, theirs = self.values, other.values
          both_nan = np.isnan(mine) & np.isnan(theirs)
          return ((mine == theirs) | both_nan).all()
  class FloatBlock(FloatOrComplexBlock):
      """ block of floating point values """
      __slots__ = ()
      is_float = True
      _downcast_dtype = 'int64'

      def _can_hold_element(self, element):
          """ True for float/integer scalars or list-likes, excluding
          bool and datetime/timedelta values """
          if not is_list_like(element):
              if not isinstance(element, (float, int, np.float_, np.int_)):
                  return False
              return not isinstance(
                  element, (bool, np.bool_, datetime, timedelta,
                            np.datetime64, np.timedelta64))

          element = np.array(element)
          tipo = element.dtype.type
          return issubclass(tipo, (np.floating, np.integer)) and not issubclass(
              tipo, (np.datetime64, np.timedelta64))

      def _try_cast(self, element):
          """ ``float(element)`` when possible, else the element itself """
          try:
              converted = float(element)
          except:  # pragma: no cover
              return element
          return converted

      def to_native_types(self, slicer=None, na_rep='', float_format=None,
                          **kwargs):
          """ convert to our native types format, slicing if desired

          Nulls become ``na_rep``; when ``float_format`` is truthy every
          non-null value is rendered with ``float_format % value``. """
          source = self.values if slicer is None else self.values[:, slicer]
          native = np.array(source, dtype=object)
          null_mask = isnull(native)
          native[null_mask] = na_rep

          if float_format:
              keep = (~null_mask).ravel()
              formatted = [float_format % val
                           for val in native.ravel()[keep]]
              native.flat[keep] = np.array(formatted)

          return native.tolist()

      def should_store(self, value):
          """ True only for floating values of exactly my dtype """
          # when inserting a column should not coerce integers to floats
          # unnecessarily
          value_dtype = value.dtype
          return (issubclass(value_dtype.type, np.floating) and
                  value_dtype == self.dtype)
  859. class ComplexBlock(FloatOrComplexBlock):
  860. __slots__ = ()
  861. is_complex = True
  862. def _can_hold_element(self, element):
  863. if is_list_like(element):
  864. element = np.array(element)
  865. return issubclass(element.dtype.type, (np.floating, np.integer, np.complexfloating))
  866. return (isinstance(element, (float, int, complex, np.float_, np.int_)) and
  867. not isinstance(bool, np.bool_))
  868. def _try_cast(self, element):
  869. try:
  870. return complex(element)
  871. except: # pragma: no cover
  872. return element
  873. def should_store(self, value):
  874. return issubclass(value.dtype.type, np.complexfloating)
  875. class IntBlock(NumericBlock):
  876. __slots__ = ()
  877. is_integer = True
  878. _can_hold_na = False
  879. def _can_hold_element(self, element):
  880. if is_list_like(element):
  881. element = np.array(element)
  882. tipo = element.dtype.type
  883. return issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))
  884. return com.is_integer(element)
  885. def _try_cast(self, element):
  886. try:
  887. return int(element)
  888. except: # pragma: no cover
  889. return element
  890. def should_store(self, value):
  891. return com.is_integer_dtype(value) and value.dtype == self.dtype
  892. class TimeDeltaBlock(IntBlock):
  893. __slots__ = ()
  894. is_timedelta = True
  895. _can_hold_na = True
  896. is_numeric = False
  897. @property
  898. def fill_value(self):
  899. return tslib.iNaT
  900. def _try_fill(self, value):
  901. """ if we are a NaT, return the actual fill value """
  902. if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all():
  903. value = tslib.iNaT
  904. elif isinstance(value, np.timedelta64):
  905. pass
  906. elif com.is_integer(value):
  907. # coerce to seconds of timedelta
  908. value = np.timedelta64(int(value * 1e9))
  909. elif isinstance(value, timedelta):
  910. value = np.timedelta64(value)
  911. return value
  912. def _try_coerce_args(self, values, other):
  913. """ provide coercion to our input arguments
  914. we are going to compare vs i8, so coerce to floats
  915. repring NaT with np.nan so nans propagate
  916. values is always ndarray like, other may not be """
  917. def masker(v):
  918. mask = isnull(v)
  919. v = v.view('i8').astype('float64')
  920. v[mask] = np.nan
  921. return v
  922. values = masker(values)
  923. if _is_null_datelike_scalar(other):
  924. other = np.nan
  925. elif isinstance(other, np.timedelta64):
  926. other = _coerce_scalar_to_timedelta_type(other, unit='s').item()
  927. if other == tslib.iNaT:
  928. other = np.nan
  929. else:
  930. other = masker(other)
  931. return values, other
  932. def _try_operate(self, values):
  933. """ return a version to operate on """
  934. return values.view('i8')
  935. def _try_coerce_result(self, result):
  936. """ reverse of try_coerce_args / try_operate """
  937. if isinstance(result, np.ndarray):
  938. mask = isnull(result)
  939. if result.dtype.kind in ['i', 'f', 'O']:
  940. result = result.astype('m8[ns]')
  941. result[mask] = tslib.iNaT
  942. elif isinstance(result, np.integer):
  943. result = np.timedelta64(result)
  944. return result
  945. def should_store(self, value):
  946. return issubclass(value.dtype.type, np.timedelta64)
  947. def to_native_types(self, slicer=None, na_rep=None, **kwargs):
  948. """ convert to our native types format, slicing if desired """
  949. values = self.values
  950. if slicer is not None:
  951. values = values[:, slicer]
  952. mask = isnull(values)
  953. rvalues = np.empty(values.shape, dtype=object)
  954. if na_rep is None:
  955. na_rep = 'NaT'
  956. rvalues[mask] = na_rep
  957. imask = (~mask).ravel()
  958. rvalues.flat[imask] = np.array([lib.repr_timedelta64(val)
  959. for val in values.ravel()[imask]],
  960. dtype=object)
  961. return rvalues.tolist()
  962. class BoolBlock(NumericBlock):
  963. __slots__ = ()
  964. is_bool = True
  965. _can_hold_na = False
  966. def _can_hold_element(self, element):
  967. if is_list_like(element):
  968. element = np.array(element)
  969. return issubclass(element.dtype.type, np.integer)
  970. return isinstance(element, (int, bool))
  971. def _try_cast(self, element):
  972. try:
  973. return bool(element)
  974. except: # pragma: no cover
  975. return element
  976. def should_store(self, value):
  977. return issubclass(value.dtype.type, np.bool_)
  978. def replace(self, to_replace, value, inplace=False, filter=None,
  979. regex=False):
  980. to_replace_values = np.atleast_1d(to_replace)
  981. if not np.can_cast(to_replace_values, bool):
  982. return self
  983. return super(BoolBlock, self).replace(to_replace, value,
  984. inplace=inplace, filter=filter,
  985. regex=regex)
  986. class ObjectBlock(Block):
  987. __slots__ = ()
  988. is_object = True
  989. _can_hold_na = True
  990. def __init__(self, values, ndim=2, fastpath=False,
  991. placement=None):
  992. if issubclass(values.dtype.type, compat.string_types):
  993. values = np.array(values, dtype=object)
  994. super(ObjectBlock, self).__init__(values, ndim=ndim,
  995. fastpath=fastpath,
  996. placement=placement)
  997. @property
  998. def is_bool(self):
  999. """ we can be a bool if we have only bool values but are of type
  1000. object
  1001. """
  1002. return lib.is_bool_array(self.values.ravel())
  1003. def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=True,
  1004. copy=True, by_item=True):
  1005. """ attempt to coerce any object types to better types
  1006. return a copy of the block (if copy = True)
  1007. by definition we ARE an ObjectBlock!!!!!
  1008. can return multiple blocks!
  1009. """
  1010. # attempt to create new type blocks
  1011. blocks = []
  1012. if by_item and not self._is_single_block:
  1013. for i, rl in enumerate(self.mgr_locs):
  1014. values = self.iget(i)
  1015. values = com._possibly_convert_objects(
  1016. values.ravel(), convert_dates=convert_dates,
  1017. convert_numeric=convert_numeric,
  1018. convert_timedeltas=convert_timedeltas,
  1019. ).reshape(values.shape)
  1020. values = _block_shape(values, ndim=self.ndim)
  1021. newb = make_block(values,
  1022. ndim=self.ndim, placement=[rl])
  1023. blocks.append(newb)
  1024. else:
  1025. values = com._possibly_convert_objects(
  1026. self.values.ravel(), convert_dates=convert_dates,
  1027. convert_numeric=convert_numeric
  1028. ).reshape(self.values.shape)
  1029. blocks.append(make_block(values,
  1030. ndim=self.ndim, placement=self.mgr_locs))
  1031. return blocks
  1032. def set(self, locs, values, check=False):
  1033. """
  1034. Modify Block in-place with new item value
  1035. Returns
  1036. -------
  1037. None
  1038. """
  1039. # GH6026
  1040. if check:
  1041. try:
  1042. if (self.values[locs] == values).all():
  1043. return
  1044. except:
  1045. pass
  1046. try:
  1047. self.values[locs] = values
  1048. except (ValueError):
  1049. # broadcasting error
  1050. # see GH6171
  1051. new_shape = list(values.shape)
  1052. new_shape[0] = len(self.items)
  1053. self.values = np.empty(tuple(new_shape),dtype=self.dtype)
  1054. self.values.fill(np.nan)
  1055. self.values[locs] = values
  1056. def _maybe_downcast(self, blocks, downcast=None):
  1057. if downcast is not None:
  1058. return blocks
  1059. # split and convert the blocks
  1060. result_blocks = []
  1061. for blk in blocks:
  1062. result_blocks.extend(blk.convert(convert_dates=True,
  1063. convert_numeric=False))
  1064. return result_blocks
  1065. def _can_hold_element(self, element):
  1066. return True
  1067. def _try_cast(self, element):
  1068. return element
  1069. def should_store(self, value):
  1070. return not issubclass(value.dtype.type,
  1071. (np.integer, np.floating, np.complexfloating,
  1072. np.datetime64, np.bool_))
  1073. def replace(self, to_replace, value, inplace=False, filter=None,
  1074. regex=False):
  1075. blk = [self]
  1076. to_rep_is_list = com.is_list_like(to_replace)
  1077. value_is_list = com.is_list_like(value)
  1078. both_lists = to_rep_is_list and value_is_list
  1079. either_list = to_rep_is_list or value_is_list
  1080. if not either_list and com.is_re(to_replace):
  1081. blk[0], = blk[0]._replace_single(to_replace, value,
  1082. inplace=inplace, filter=filter,
  1083. regex=True)
  1084. elif not (either_list or regex):
  1085. blk = super(ObjectBlock, self).replace(to_replace, value,
  1086. inplace=inplace,
  1087. filter=filter, regex=regex)
  1088. elif both_lists:
  1089. for to_rep, v in zip(to_replace, value):
  1090. blk[0], = blk[0]._replace_single(to_rep, v, inplace=inplace,
  1091. filter=filter, regex=regex)
  1092. elif to_rep_is_list and regex:
  1093. for to_rep in to_replace:
  1094. blk[0], = blk[0]._replace_single(to_rep, value,
  1095. inplace=inplace,
  1096. filter=filter, regex=regex)
  1097. else:
  1098. blk[0], = blk[0]._replace_single(to_replace, value,
  1099. inplace=inplace, filter=filter,
  1100. regex=regex)
  1101. return blk
  1102. def _replace_single(self, to_replace, value, inplace=False, filter=None,
  1103. regex=False):
  1104. # to_replace is regex compilable
  1105. to_rep_re = regex and com.is_re_compilable(to_replace)
  1106. # regex is regex compilable
  1107. regex_re = com.is_re_compilable(regex)
  1108. # only one will survive
  1109. if to_rep_re and regex_re:
  1110. raise AssertionError('only one of to_replace and regex can be '
  1111. 'regex compilable')
  1112. # if regex was passed as something that can be a regex (rather than a
  1113. # boolean)
  1114. if regex_re:
  1115. to_replace = regex
  1116. regex = regex_re or to_rep_re
  1117. # try to get the pattern attribute (compiled re) or it's a string
  1118. try:
  1119. pattern = to_replace.pattern
  1120. except AttributeError:
  1121. pattern = to_replace
  1122. # if the pattern is not empty and to_replace is either a string or a
  1123. # regex
  1124. if regex and pattern:
  1125. rx = re.compile(to_replace)
  1126. else:
  1127. # if the thing to replace is not a string or compiled regex call
  1128. # the superclass method -> to_replace is some kind of object
  1129. result = super(ObjectBlock, self).replace(to_replace, value,
  1130. inplace=inplace,
  1131. filter=filter,
  1132. regex=regex)
  1133. if not isinstance(result, list):
  1134. result = [result]
  1135. return result
  1136. new_values = self.values if inplace else self.values.copy()
  1137. # deal with replacing values with objects (strings) that match but
  1138. # whose replacement is not a string (numeric, nan, object)
  1139. if isnull(value) or not isinstance(value, compat.string_types):
  1140. def re_replacer(s):
  1141. try:
  1142. return value if rx.search(s) is not None else s
  1143. except TypeError:
  1144. return s
  1145. else:
  1146. # value is guaranteed to be a string here, s can be either a string
  1147. # or null if it's null it gets returned
  1148. def re_replacer(s):
  1149. try:
  1150. return rx.sub(value, s)
  1151. except TypeError:
  1152. return s
  1153. f = np.vectorize(re_replacer, otypes=[self.dtype])
  1154. if filter is None:
  1155. filt = slice(None)
  1156. else:
  1157. filt = self.mgr_locs.isin(filter).nonzero()[0]
  1158. new_values[filt] = f(new_values[filt])
  1159. return [self if inplace else
  1160. make_block(new_values,
  1161. fastpath=True, placement=self.mgr_locs)]
  1162. class DatetimeBlock(Block):
  1163. __slots__ = ()
  1164. is_datetime = True
  1165. _can_hold_na = True
  1166. def __init__(self, values, placement,
  1167. fastpath=False, **kwargs):
  1168. if values.dtype != _NS_DTYPE:
  1169. values = tslib.cast_to_nanoseconds(values)
  1170. super(DatetimeBlock, self).__init__(values,
  1171. fastpath=True, placement=placement,
  1172. **kwargs)
  1173. def _can_hold_element(self, element):
  1174. if is_list_like(element):
  1175. element = np.array(element)
  1176. return element.dtype == _NS_DTYPE or element.dtype == np.int64
  1177. return (com.is_integer(element) or
  1178. isinstance(element, datetime) or
  1179. isnull(element))
  1180. def _try_cast(self, element):
  1181. try:
  1182. return int(element)
  1183. except:
  1184. return element
  1185. def _try_operate(self, values):
  1186. """ return a version to operate on """
  1187. return values.view('i8')
  1188. def _try_coerce_args(self, values, other):
  1189. """ provide coercion to our input arguments
  1190. we are going to compare vs i8, so coerce to integer
  1191. values is always ndarra like, other may not be """
  1192. values = values.view('i8')
  1193. if _is_null_datelike_scalar(other):
  1194. other = tslib.iNaT
  1195. elif isinstance(other, datetime):
  1196. other = lib.Timestamp(other).asm8.view('i8')
  1197. else:
  1198. other = other.view('i8')
  1199. return values, other
  1200. def _try_coerce_result(self, result):
  1201. """ reverse of try_coerce_args """
  1202. if isinstance(result, np.ndarray):
  1203. if result.dtype == 'i8':
  1204. result = tslib.array_to_datetime(
  1205. result.astype(object).ravel()).reshape(result.shape)
  1206. elif result.dtype.kind in ['i', 'f', 'O']:
  1207. result = result.astype('M8[ns]')
  1208. elif isinstance(result, (np.integer, np.datetime64)):
  1209. result = lib.Timestamp(result)
  1210. return result
  1211. @property
  1212. def fill_value(self):
  1213. return tslib.iNaT
  1214. def _try_fill(self, value):
  1215. """ if we are a NaT, return the actual fill value """
  1216. if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all():
  1217. value = tslib.iNaT
  1218. return value
  1219. def fillna(self, value, limit=None,
  1220. inplace=False, downcast=None):
  1221. # straight putmask here
  1222. values = self.values if inplace else self.values.copy()
  1223. mask = isnull(self.values)
  1224. value = self._try_fill(value)
  1225. if limit is not None:
  1226. if self.ndim > 2:
  1227. raise NotImplementedError
  1228. mask[mask.cumsum(self.ndim-1)>limit]=False
  1229. np.putmask(values, mask, value)
  1230. return [self if inplace else
  1231. make_block(values,
  1232. fastpath=True, placement=self.mgr_locs)]
  1233. def to_native_types(self, slicer=None, na_rep=None, date_format=None,
  1234. **kwargs):
  1235. """ convert to our native types format, slicing if desired """
  1236. values = self.values
  1237. if slicer is not None:
  1238. values = values[:, slicer]
  1239. mask = isnull(values)
  1240. rvalues = np.empty(values.shape, dtype=object)
  1241. if na_rep is None:
  1242. na_rep = 'NaT'
  1243. rvalues[mask] = na_rep
  1244. imask = (~mask).ravel()
  1245. if date_format is None:
  1246. date_formatter = lambda x: Timestamp(x)._repr_base
  1247. else:
  1248. date_formatter = lambda x: Timestamp(x).strftime(date_format)
  1249. rvalues.flat[imask] = np.array([date_formatter(val) for val in
  1250. values.ravel()[imask]], dtype=object)
  1251. return rvalues.tolist()
  1252. def should_store(self, value):
  1253. return issubclass(value.dtype.type, np.datetime64)
  1254. def astype(self, dtype, copy=False, raise_on_error=True):
  1255. """
  1256. handle convert to object as a special case
  1257. """
  1258. klass = None
  1259. if np.dtype(dtype).type == np.object_:
  1260. klass = ObjectBlock
  1261. return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
  1262. klass=klass)
  1263. def set(self, locs, values, check=False):
  1264. """
  1265. Modify Block in-place with new item value
  1266. Returns
  1267. -------
  1268. None
  1269. """
  1270. if values.dtype != _NS_DTYPE:
  1271. # Workaround for numpy 1.6 bug
  1272. values = tslib.cast_to_nanoseconds(values)
  1273. self.values[locs] = values
  1274. def get_values(self, dtype=None):
  1275. # return object dtype as Timestamps
  1276. if dtype == object:
  1277. return lib.map_infer(self.values.ravel(), lib.Timestamp)\
  1278. .reshape(self.values.shape)
  1279. return self.values
  1280. class SparseBlock(Block):
  1281. """ implement as a list of sparse arrays of the same dtype """
  1282. __slots__ = ()
  1283. is_sparse = True
  1284. is_numeric = True
  1285. _can_hold_na = True
  1286. _can_consolidate = False
  1287. _verify_integrity = False
  1288. _ftype = 'sparse'
  1289. def __init__(self, values, placement,
  1290. ndim=None, fastpath=False,):
  1291. # kludgetastic
  1292. if ndim is None:
  1293. if len(placement) != 1:
  1294. ndim = 1
  1295. else:
  1296. ndim = 2
  1297. self.ndim = ndim
  1298. self.mgr_locs = placement
  1299. if not isinstance(values, SparseArray):
  1300. raise TypeError("values must be SparseArray")
  1301. self.values = values
  1302. @property
  1303. def shape(self):
  1304. return (len(self.mgr_locs), self.sp_index.length)
  1305. @property
  1306. def itemsize(self):
  1307. return self.dtype.itemsize
  1308. @property
  1309. def fill_value(self):
  1310. #return np.nan
  1311. return self.values.fill_value
  1312. @fill_value.setter
  1313. def fill_value(self, v):
  1314. # we may need to upcast our fill to match our dtype
  1315. if issubclass(self.dtype.type, np.floating):
  1316. v = float(v)
  1317. self.values.fill_value = v
  1318. @property
  1319. def sp_values(self):
  1320. return self.values.sp_values
  1321. @sp_values.setter
  1322. def sp_values(self, v):
  1323. # reset the sparse values
  1324. self.values = SparseArray(v, sparse_index=self.sp_index,
  1325. kind=self.kind, dtype=v.dtype,
  1326. fill_value=self.values.fill_value,
  1327. copy=False)
  1328. def iget(self, col):
  1329. if col != 0:
  1330. raise IndexError("SparseBlock only contains one item")
  1331. return self.values
  1332. @property
  1333. def sp_index(self):
  1334. return self.values.sp_index
  1335. @property
  1336. def kind(self):
  1337. return self.values.kind
  1338. def __len__(self):
  1339. try:
  1340. return self.sp_index.length
  1341. except:
  1342. return 0
  1343. def should_store(self, value):
  1344. return isinstance(value, SparseArray)
  1345. def set(self, locs, values, check=False):
  1346. assert locs.tolist() == [0]
  1347. self.values = values
  1348. def get(self, item):
  1349. if self.ndim == 1:
  1350. loc = self.items.get_loc(item)
  1351. return self.values[loc]
  1352. else:
  1353. return self.values
  1354. def _slice(self, slicer):
  1355. """ return a slice of my values (but densify first) """
  1356. return self.get_values()[slicer]
  1357. def get_values(self, dtype=None):
  1358. """ need to to_dense myself (and always return a ndim sized object) """
  1359. values = self.values.to_dense()
  1360. if values.ndim == self.ndim - 1:
  1361. values = values.reshape((1,) + values.shape)
  1362. return values
  1363. def copy(self, deep=True):
  1364. return self.make_block_same_class(values=self.values,
  1365. sparse_index=self.sp_index,
  1366. kind=self.kind, copy=deep,
  1367. placement=self.mgr_locs)
  1368. def make_block_same_class(self, values, placement,
  1369. sparse_index=None, kind=None, dtype=None,
  1370. fill_value=None, copy=False, fastpath=True):
  1371. """ return a new block """
  1372. if dtype is None:
  1373. dtype = self.dtype
  1374. if fill_value is None:
  1375. fill_value = self.values.fill_value
  1376. # if not isinstance(values, SparseArray) and values.ndim != self.ndim:
  1377. # raise ValueError("ndim mismatch")
  1378. if values.ndim == 2:
  1379. nitems = values.shape[0]
  1380. if nitems == 0:
  1381. # kludgy, but SparseBlocks cannot handle slices, where the
  1382. # output is 0-item, so let's convert it to a dense block: it
  1383. # won't take space since there's 0 items, plus it will preserve
  1384. # the dtype.
  1385. return make_block(np.empty(values.shape, dtype=dtype),
  1386. placement, fastpath=True,)
  1387. elif nitems > 1:
  1388. raise ValueError("Only 1-item 2d sparse blocks are supported")
  1389. else:
  1390. values = values.reshape(values.shape[1])
  1391. new_values = SparseArray(values, sparse_index=sparse_index,
  1392. kind=kind or self.kind, dtype=dtype,
  1393. fill_value=fill_value, copy=copy)
  1394. return make_block(new_values, ndim=self.ndim,
  1395. fastpath=fastpath, placement=placement)
  1396. def interpolate(self, method='pad', axis=0, inplace=False,
  1397. limit=None, fill_value=None, **kwargs):
  1398. values = com.interpolate_2d(
  1399. self.values.to_dense(), method, axis, limit, fill_value)
  1400. return self.make_block_same_class(values=values,
  1401. placement=self.mgr_locs)
  1402. def fillna(self, value, limit=None, inplace=False, downcast=None):
  1403. # we may need to upcast our fill to match our dtype
  1404. if limit is not None:
  1405. raise NotImplementedError
  1406. if issubclass(self.dtype.type, np.floating):
  1407. value = float(value)
  1408. values = self.values if inplace else self.values.copy()
  1409. return [self.make_block_same_class(values=values.get_values(value),
  1410. fill_value=value,
  1411. placement=self.mgr_locs)]
  1412. def shift(self, periods, axis=0):
  1413. """ shift the block by periods """
  1414. N = len(self.values.T)
  1415. indexer = np.zeros(N, dtype=int)
  1416. if periods > 0:
  1417. indexer[periods:] = np.arange(N - periods)
  1418. else:
  1419. indexer[:periods] = np.arange(-periods, N)
  1420. new_values = self.values.to_dense().take(indexer)
  1421. # convert integer to float if necessary. need to do a lot more than
  1422. # that, handle boolean etc also
  1423. new_values, fill_value = com._maybe_upcast(new_values)
  1424. if periods > 0:
  1425. new_values[:periods] = fill_value
  1426. else:
  1427. new_values[periods:] = fill_value
  1428. return [self.make_block_same_class(new_values, placement=self.mgr_locs)]
  1429. def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
  1430. limit=None, mask_info=None):
  1431. """
  1432. Reindex using pre-computed indexer information
  1433. """
  1434. if axis < 1:
  1435. raise AssertionError('axis must be at least 1, got %d' % axis)
  1436. # taking on the 0th axis always here
  1437. if fill_value is None:
  1438. fill_value = self.fill_value
  1439. return self.make_block_same_class(self.values.take(indexer),
  1440. fill_value=fill_value,
  1441. placement=self.mgr_locs)
  1442. def sparse_reindex(self, new_index):
  1443. """ sparse reindex and return a new block
  1444. current reindex only works for float64 dtype! """
  1445. values = self.values
  1446. values = values.sp_index.to_int_index().reindex(
  1447. values.sp_values.astype('float64'), values.fill_value, new_index)
  1448. return self.make_block_same_class(values, sparse_index=new_index,
  1449. placement=self.mgr_locs)
  1450. def _try_cast_result(self, result, dtype=None):
  1451. return result
  1452. def make_block(values, placement, klass=None, ndim=None,
  1453. dtype=None, fastpath=False):
  1454. if klass is None:
  1455. dtype = dtype or values.dtype
  1456. vtype = dtype.type
  1457. if isinstance(values, SparseArray):
  1458. klass = SparseBlock
  1459. elif issubclass(vtype, np.floating):
  1460. klass = FloatBlock
  1461. elif (issubclass(vtype, np.integer) and
  1462. issubclass(vtype, np.timedelta64)):
  1463. klass = TimeDeltaBlock
  1464. elif (issubclass(vtype, np.integer) and
  1465. not issubclass(vtype, np.datetime64)):
  1466. klass = IntBlock
  1467. elif dtype == np.bool_:
  1468. klass = BoolBlock
  1469. elif issubclass(vtype, np.datetime64):
  1470. klass = DatetimeBlock
  1471. elif issubclass(vtype, np.complexfloating):
  1472. klass = ComplexBlock
  1473. else:
  1474. # we want to infer here if its a datetimelike if its object type
  1475. # this is pretty strict in that it requires a datetime/timedelta
  1476. # value IN addition to possible nulls/strings
  1477. # an array of ONLY strings will not be inferred
  1478. if np.prod(values.shape):
  1479. result = _possibly_infer_to_datetimelike(values)
  1480. vtype = result.dtype.type
  1481. if issubclass(vtype, np.datetime64):
  1482. klass = DatetimeBlock
  1483. values = result
  1484. elif (issubclass(vtype, np.timedelta64)):
  1485. klass = TimeDeltaBlock
  1486. values = result
  1487. if klass is None:
  1488. klass = ObjectBlock
  1489. return klass(values, ndim=ndim, fastpath=fastpath,
  1490. placement=placement)
  1491. # TODO: flexible with index=None and/or items=None
  1492. class BlockManager(PandasObject):
  1493. """
  1494. Core internal data structure to implement DataFrame
  1495. Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
  1496. lightweight blocked set of labeled data to be manipulated by the DataFrame
  1497. public API class
  1498. Attributes
  1499. ----------
  1500. shape
  1501. ndim
  1502. axes
  1503. values
  1504. items
  1505. Methods
  1506. -------
  1507. set_axis(axis, new_labels)
  1508. copy(deep=True)
  1509. get_dtype_counts
  1510. get_ftype_counts
  1511. get_dtypes
  1512. get_ftypes
  1513. apply(func, axes, block_filter_fn)
  1514. get_bool_data
  1515. get_numeric_data
  1516. get_slice(slice_like, axis)
  1517. get(label)
  1518. iget(loc)
  1519. get_scalar(label_tup)
  1520. take(indexer, axis)
  1521. reindex_axis(new_labels, axis)
  1522. reindex_indexer(new_labels, indexer, axis)
  1523. delete(label)
  1524. insert(loc, label, value)
  1525. set(label, value)
  1526. Parameters
  1527. ----------
  1528. Notes
  1529. -----
  1530. This is *not* a public API class
  1531. """
  1532. __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated',
  1533. '_is_consolidated', '_blknos', '_blklocs']
  1534. def __init__(self, blocks, axes, do_integrity_check=True, fastpath=True):
  1535. self.axes = [_ensure_index(ax) for ax in axes]
  1536. self.blocks = tuple(blocks)
  1537. for block in blocks:
  1538. if block.is_sparse:
  1539. if len(block.mgr_locs) != 1:
  1540. raise AssertionError("Sparse block refers to multiple items")
  1541. else:
  1542. if self.ndim != block.ndim:
  1543. raise AssertionError(('Number of Block dimensions (%d) must '
  1544. 'equal number of axes (%d)')
  1545. % (block.ndim, self.ndim))
  1546. if do_integrity_check:
  1547. self._verify_integrity()
  1548. self._consolidate_check()
  1549. self._rebuild_blknos_and_blklocs()
  1550. def make_empty(self, axes=None):
  1551. """ return an empty BlockManager with the items axis of len 0 """
  1552. if axes is None:
  1553. axes = [_ensure_index([])] + [
  1554. _ensure_index(a) for a in self.axes[1:]
  1555. ]
  1556. # preserve dtype if possible
  1557. if self.ndim == 1:
  1558. blocks = np.array([], dtype=self.dtype)
  1559. else:
  1560. blocks = []
  1561. return self.__class__(blocks, axes)
  1562. def __nonzero__(self):
  1563. return True
  1564. # Python3 compat
  1565. __bool__ = __nonzero__
  1566. @property
  1567. def shape(self):
  1568. return tuple(len(ax) for ax in self.axes)
  1569. @property
  1570. def ndim(self):
  1571. return len(self.axes)
  1572. def set_axis(self, axis, new_labels):
  1573. new_labels = _ensure_index(new_labels)
  1574. old_len = len(self.axes[axis])
  1575. new_len = len(new_labels)
  1576. if new_len != old_len:
  1577. raise ValueError('Length mismatch: Expected axis has %d elements, '
  1578. 'new values have %d elements' % (old_len, new_len))
  1579. self.axes[axis] = new_labels
  1580. def rename_axis(self, mapper, axis, copy=True):
  1581. """
  1582. Rename one of axes.
  1583. Parameters
  1584. ----------
  1585. mapper : unary callable
  1586. axis : int
  1587. copy : boolean, default True
  1588. """
  1589. obj = self.copy(deep=copy)
  1590. obj.set_axis(axis, _transform_index(self.axes[axis], mapper))
  1591. return obj
  1592. def add_prefix(self, prefix):
  1593. f = (str(prefix) + '%s').__mod__
  1594. return self.rename_axis(f, axis=0)
  1595. def add_suffix(self, suffix):
  1596. f = ('%s' + str(suffix)).__mod__
  1597. return self.rename_axis(f, axis=0)
  1598. @property
  1599. def _is_single_block(self):
  1600. if self.ndim == 1:
  1601. return True
  1602. if len(self.blocks) != 1:
  1603. return False
  1604. blk = self.blocks[0]
  1605. return (blk.mgr_locs.is_slice_like and
  1606. blk.mgr_locs.as_slice == slice(0, len(self), 1))
  1607. def _rebuild_blknos_and_blklocs(self):
  1608. """
  1609. Update mgr._blknos / mgr._blklocs.
  1610. """
  1611. new_blknos = np.empty(self.shape[0], dtype=np.int64)
  1612. new_blklocs = np.empty(self.shape[0], dtype=np.int64)
  1613. new_blknos.fill(-1)
  1614. new_blklocs.fill(-1)
  1615. for blkno, blk in enumerate(self.blocks):
  1616. rl = blk.mgr_locs
  1617. new_blknos[rl.indexer] = blkno
  1618. new_blklocs[rl.indexer] = np.arange(len(rl))
  1619. if (new_blknos == -1).any():
  1620. raise AssertionError("Gaps in blk ref_locs")
  1621. self._blknos = new_blknos
  1622. self._blklocs = new_blklocs
  1623. # make items read only for now
  1624. def _get_items(self):
  1625. return self.axes[0]
  1626. items = property(fget=_get_items)
  1627. def _get_counts(self, f):
  1628. """ return a dict of the counts of the function in BlockManager """
  1629. self._consolidate_inplace()
  1630. counts = dict()
  1631. for b in self.blocks:
  1632. v = f(b)
  1633. counts[v] = counts.get(v, 0) + b.shape[0]
  1634. return counts
  1635. def get_dtype_counts(self):
  1636. return self._get_counts(lambda b: b.dtype.name)
  1637. def get_ftype_counts(self):
  1638. return self._get_counts(lambda b: b.ftype)
  1639. def get_dtypes(self):
  1640. dtypes = np.array([blk.dtype for blk in self.blocks])
  1641. return com.take_1d(dtypes, self._blknos, allow_fill=False)
  1642. def get_ftypes(self):
  1643. ftypes = np.array([blk.ftype for blk in self.blocks])
  1644. return com.take_1d(ftypes, self._blknos, allow_fill=False)
  1645. def __getstate__(self):
  1646. block_values = [b.values for b in self.blocks]
  1647. block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
  1648. axes_array = [ax for ax in self.axes]
  1649. return axes_array, block_values, block_items
  1650. def __setstate__(self, state):
  1651. # discard anything after 3rd, support beta pickling format for a little
  1652. # while longer
  1653. ax_arrays, bvalues, bitems = state[:3]
  1654. self.axes = [_ensure_index(ax) for ax in ax_arrays]
  1655. blocks = []
  1656. for values, items in zip(bvalues, bitems):
  1657. # numpy < 1.7 pickle compat
  1658. if values.dtype == 'M8[us]':
  1659. values = values.astype('M8[ns]')
  1660. blk = make_block(values,
  1661. placement=self.axes[0].get_indexer(items))
  1662. blocks.append(blk)
  1663. self.blocks = tuple(blocks)
  1664. self._post_setstate()
  1665. def _post_setstate(self):
  1666. self._is_consolidated = False
  1667. self._known_consolidated = False
  1668. self._rebuild_blknos_and_blklocs()
    def __len__(self):
        """Return the number of items (length of the items axis)."""
        return len(self.items)
  1671. def __unicode__(self):
  1672. output = com.pprint_thing(self.__class__.__name__)
  1673. for i, ax in enumerate(self.axes):
  1674. if i == 0:
  1675. output += u('\nItems: %s') % ax
  1676. else:
  1677. output += u('\nAxis %d: %s') % (i, ax)
  1678. for block in self.blocks:
  1679. output += u('\n%s') % com.pprint_thing(block)
  1680. return output
  1681. def _verify_integrity(self):
  1682. mgr_shape = self.shape
  1683. tot_items = sum(len(x.mgr_locs) for x in self.blocks)
  1684. for block in self.blocks:
  1685. if not block.is_sparse and block.shape[1:] != mgr_shape[1:]:
  1686. construction_error(tot_items, block.shape[1:], self.axes)
  1687. if len(self.items) != tot_items:
  1688. raise AssertionError('Number of manager items must equal union of '
  1689. 'block items\n# manager items: {0}, # '
  1690. 'tot_items: {1}'.format(len(self.items),
  1691. tot_items))
    def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs):
        """
        iterate over the blocks, collect and create a new block manager

        Parameters
        ----------
        f : str, name of the Block method to call on each block (it is
            resolved with ``getattr(block, f)``)
        axes : optional (if not supplied, use self.axes)
        filter : list, if supplied, only call the block if the filter is in
                 the block
        do_integrity_check : boolean, default False. Do the block manager integrity check
        **kwargs : forwarded to the block method; some entries are re-aligned
            per block first (see below)

        Returns
        -------
        Block Manager (new object)

        """

        result_blocks = []

        # filter kwarg is used in replace-* family of methods
        if filter is not None:
            filter_locs = set(self.items.get_indexer_for(filter))
            if len(filter_locs) == len(self.items):
                # All items are included, as if there were no filtering
                filter = None
            else:
                # the block method receives positions, not labels
                kwargs['filter'] = filter_locs

        # Pick which keyword arguments have to be aligned to each block's
        # items before the call, and whether that alignment should copy.
        if f == 'where' and kwargs.get('align', True):
            align_copy = True
            align_keys = ['other', 'cond']
        elif f == 'putmask' and kwargs.get('align', True):
            align_copy = False
            align_keys = ['new', 'mask']
        elif f == 'eval':
            align_copy = False
            align_keys = ['other']
        elif f == 'fillna':
            # fillna internally does putmask, maybe it's better to do this
            # at mgr, not block level?
            align_copy = False
            align_keys = ['value']
        else:
            # align_copy is deliberately left unset: with no align keys
            # aligned_args is empty and align_copy is never read.
            align_keys = []

        # Only objects exposing ``reindex_axis`` can be aligned; anything else
        # is passed through untouched.  Note that ``kwargs[k]`` is looked up
        # unconditionally, so a missing align key raises KeyError here.
        aligned_args = dict((k, kwargs[k]) for k in align_keys
                            if hasattr(kwargs[k], 'reindex_axis'))

        for b in self.blocks:
            if filter is not None:
                # blocks holding none of the filtered items are kept as-is
                if not b.mgr_locs.isin(filter_locs).any():
                    result_blocks.append(b)
                    continue

            if aligned_args:
                b_items = self.items[b.mgr_locs.indexer]

                for k, obj in aligned_args.items():
                    axis = getattr(obj, '_info_axis_number', 0)
                    # kwargs is overwritten per block with the aligned object
                    kwargs[k] = obj.reindex_axis(b_items, axis=axis,
                                                 copy=align_copy)

            applied = getattr(b, f)(**kwargs)

            # a block method may return one block or a list of blocks
            if isinstance(applied, list):
                result_blocks.extend(applied)
            else:
                result_blocks.append(applied)

        if len(result_blocks) == 0:
            return self.make_empty(axes or self.axes)
        bm = self.__class__(result_blocks, axes or self.axes,
                            do_integrity_check=do_integrity_check)
        bm._consolidate_inplace()
        return bm
    def isnull(self, **kwargs):
        """
        Call the block-level ``apply`` method on every block via
        ``self.apply`` and return the resulting manager.

        NOTE(review): the null-check function itself is presumably supplied
        by the caller inside ``kwargs`` -- confirm against callers.
        """
        return self.apply('apply', **kwargs)
    def where(self, **kwargs):
        """Call each block's ``where`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('where', **kwargs)
    def eval(self, **kwargs):
        """Call each block's ``eval`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('eval', **kwargs)
    def setitem(self, **kwargs):
        """Call each block's ``setitem`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('setitem', **kwargs)
    def putmask(self, **kwargs):
        """Call each block's ``putmask`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('putmask', **kwargs)
    def diff(self, **kwargs):
        """Call each block's ``diff`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('diff', **kwargs)
    def interpolate(self, **kwargs):
        """Call each block's ``interpolate`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('interpolate', **kwargs)
    def shift(self, **kwargs):
        """Call each block's ``shift`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('shift', **kwargs)
    def fillna(self, **kwargs):
        """Call each block's ``fillna`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('fillna', **kwargs)
    def downcast(self, **kwargs):
        """Call each block's ``downcast`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('downcast', **kwargs)
    def astype(self, dtype, **kwargs):
        """
        Call each block's ``astype`` via ``self.apply``, passing ``dtype``
        and any extra kwargs through.
        """
        return self.apply('astype', dtype=dtype, **kwargs)
    def convert(self, **kwargs):
        """Call each block's ``convert`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('convert', **kwargs)
    def replace(self, **kwargs):
        """Call each block's ``replace`` via ``self.apply``; kwargs are forwarded."""
        return self.apply('replace', **kwargs)
  1781. def replace_list(self, src_list, dest_list, inplace=False, regex=False):
  1782. """ do a list replace """
  1783. # figure out our mask a-priori to avoid repeated replacements
  1784. values = self.as_matrix()
  1785. def comp(s):
  1786. if isnull(s):
  1787. return isnull(values)
  1788. return _possibly_compare(values, getattr(s, 'asm8', s),
  1789. operator.eq)
  1790. masks = [comp(s) for i, s in enumerate(src_list)]
  1791. result_blocks = []
  1792. for blk in self.blocks:
  1793. # its possible to get multiple result blocks here
  1794. # replace ALWAYS will return a list
  1795. rb = [blk if inplace else blk.copy()]
  1796. for i, (s, d) in enumerate(zip(src_list, dest_list)):
  1797. new_rb = []
  1798. for b in rb:
  1799. if b.dtype == np.object_:
  1800. result = b.replace(s, d, inplace=inplace,
  1801. regex=regex)
  1802. if isinstance(result, list):
  1803. new_rb.extend(result)
  1804. else:
  1805. new_rb.append(result)
  1806. else:
  1807. # get our mask for this element, sized to this
  1808. # particular block
  1809. m = masks[i][b.mgr_locs.indexer]
  1810. if m.any():
  1811. new_rb.extend(b.putmask(m, d, inplace=True))
  1812. else:
  1813. new_rb.append(b)
  1814. rb = new_rb
  1815. result_blocks.extend(rb)
  1816. bm = self.__class__(result_blocks, self.axes)
  1817. bm._consolidate_inplace()
  1818. return bm
  1819. def is_consolidated(self):
  1820. """
  1821. Return True if more than one block with the same dtype
  1822. """
  1823. if not self._known_consolidated:
  1824. self._consolidate_check()
  1825. return self._is_consolidated
  1826. def _consolidate_check(self):
  1827. ftypes = [blk.ftype for blk in self.blocks]
  1828. self._is_consolidated = len(ftypes) == len(set(ftypes))
  1829. self._known_consolidated = True
    @property
    def is_mixed_type(self):
        """True when more than one block remains after consolidating in place."""
        # Warning, consolidation needs to get checked upstairs
        self._consolidate_inplace()
        return len(self.blocks) > 1
  1835. @property
  1836. def is_numeric_mixed_type(self):
  1837. # Warning, consolidation needs to get checked upstairs
  1838. self._consolidate_inplace()
  1839. return all([block.is_numeric for block in self.blocks])
  1840. @property
  1841. def is_datelike_mixed_type(self):
  1842. # Warning, consolidation needs to get checked upstairs
  1843. self._consolidate_inplace()
  1844. return any([block.is_datelike for block in self.blocks])
  1845. @property
  1846. def is_view(self):
  1847. """ return a boolean if we are a single block and are a view """
  1848. if len(self.blocks) == 1:
  1849. return self.blocks[0].values.base is not None
  1850. return False
  1851. def get_bool_data(self, copy=False):
  1852. """
  1853. Parameters
  1854. ----------
  1855. copy : boolean, default False
  1856. Whether to copy the blocks
  1857. """
  1858. self._consolidate_inplace()
  1859. return self.combine([b for b in self.blocks if b.is_bool], copy)
  1860. def get_numeric_data(self, copy=False):
  1861. """
  1862. Parameters
  1863. ----------
  1864. copy : boolean, default False
  1865. Whether to copy the blocks
  1866. """
  1867. self._consolidate_inplace()
  1868. return self.combine([b for b in self.blocks if b.is_numeric], copy)
  1869. def combine(self, blocks, copy=True):
  1870. """ return a new manager with the blocks """
  1871. if len(blocks) == 0:
  1872. return self.make_empty()
  1873. # FIXME: optimization potential
  1874. indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
  1875. inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
  1876. new_items = self.items.take(indexer)
  1877. new_blocks = []
  1878. for b in blocks:
  1879. b = b.copy(deep=copy)
  1880. b.mgr_locs = com.take_1d(inv_indexer, b.mgr_locs.as_array, axis=0,
  1881. allow_fill=False)
  1882. new_blocks.append(b)
  1883. new_axes = list(self.axes)
  1884. new_axes[0] = new_items
  1885. return self.__class__(new_blocks, new_axes, do_integrity_check=False)
  1886. def get_slice(self, slobj, axis=0):
  1887. if axis >= self.ndim:
  1888. raise IndexError("Requested axis not found in manager")
  1889. if axis == 0:
  1890. new_blocks = self._slice_take_blocks_ax0(slobj)
  1891. else:
  1892. slicer = [slice(None)] * (axis + 1)
  1893. slicer[axis] = slobj
  1894. slicer = tuple(slicer)
  1895. new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]
  1896. new_axes = list(self.axes)
  1897. new_axes[axis] = new_axes[axis][slobj]
  1898. bm = self.__class__(new_blocks, new_axes, do_integrity_check=False,
  1899. fastpath=True)
  1900. bm._consolidate_inplace()
  1901. return bm
    def __contains__(self, item):
        """Membership test against the items axis."""
        return item in self.items
    @property
    def nblocks(self):
        """Number of blocks currently held by the manager."""
        return len(self.blocks)
  1907. def copy(self, deep=True):
  1908. """
  1909. Make deep or shallow copy of BlockManager
  1910. Parameters
  1911. ----------
  1912. deep : boolean, default True
  1913. If False, return shallow copy (do not copy data)
  1914. Returns
  1915. -------
  1916. copy : BlockManager
  1917. """
  1918. if deep:
  1919. new_axes = [ax.view() for ax in self.axes]
  1920. else:
  1921. new_axes = list(self.axes)
  1922. return self.apply('copy', axes=new_axes, deep=deep,
  1923. do_integrity_check=False)
  1924. def as_matrix(self, items=None):
  1925. if len(self.blocks) == 0:
  1926. return np.empty(self.shape, dtype=float)
  1927. if items is not None:
  1928. mgr = self.reindex_axis(items, axis=0)
  1929. else:
  1930. mgr = self
  1931. if self._is_single_block:
  1932. return mgr.blocks[0].get_values()
  1933. else:
  1934. return mgr._interleave()
  1935. def _interleave(self):
  1936. """
  1937. Return ndarray from blocks with specified item order
  1938. Items must be contained in the blocks
  1939. """
  1940. dtype = _interleaved_dtype(self.blocks)
  1941. result = np.empty(self.shape, dtype=dtype)
  1942. if result.shape[0] == 0:
  1943. # Workaround for numpy 1.7 bug:
  1944. #
  1945. # >>> a = np.empty((0,10))
  1946. # >>> a[slice(0,0)]
  1947. # array([], shape=(0, 10), dtype=float64)
  1948. # >>> a[[]]
  1949. # Traceback (most recent call last):
  1950. # File "<stdin>", line 1, in <module>
  1951. # IndexError: index 0 is out of bounds for axis 0 with size 0
  1952. return result
  1953. itemmask = np.zeros(self.shape[0])
  1954. for blk in self.blocks:
  1955. rl = blk.mgr_locs
  1956. result[rl.indexer] = blk.get_values(dtype)
  1957. itemmask[rl.indexer] = 1
  1958. if not itemmask.all():
  1959. raise AssertionError('Some items were not contained in blocks')
  1960. return result
  1961. def xs(self, key, axis=1, copy=True, takeable=False):
  1962. if axis < 1:
  1963. raise AssertionError('Can only take xs across axis >= 1, got %d'
  1964. % axis)
  1965. # take by position
  1966. if takeable:
  1967. loc = key
  1968. else:
  1969. loc = self.axes[axis].get_loc(key)
  1970. slicer = [slice(None, None) for _ in range(self.ndim)]
  1971. slicer[axis] = loc
  1972. slicer = tuple(slicer)
  1973. new_axes = list(self.axes)
  1974. # could be an array indexer!
  1975. if isinstance(loc, (slice, np.ndarray)):
  1976. new_axes[axis] = new_axes[axis][loc]
  1977. else:
  1978. new_axes.pop(axis)
  1979. new_blocks = []
  1980. if len(self.blocks) > 1:
  1981. # we must copy here as we are mixed type
  1982. for blk in self.blocks:
  1983. newb = make_block(values=blk.values[slicer],
  1984. klass=blk.__class__, fastpath=True,
  1985. placement=blk.mgr_locs)
  1986. new_blocks.append(newb)
  1987. elif len(self.blocks) == 1:
  1988. block = self.blocks[0]
  1989. vals = block.values[slicer]
  1990. if copy:
  1991. vals = vals.copy()
  1992. new_blocks = [make_block(values=vals, placement=block.mgr_locs,
  1993. klass=block.__class__, fastpath=True,)]
  1994. return self.__class__(new_blocks, new_axes)
  1995. def fast_xs(self, loc):
  1996. """
  1997. get a cross sectional for a given location in the
  1998. items ; handle dups
  1999. return the result, is *could* be a view in the case of a
  2000. single block
  2001. """
  2002. if len(self.blocks) == 1:
  2003. return self.blocks[0].values[:, loc]
  2004. items = self.items
  2005. # non-unique (GH4726)
  2006. if not items.is_unique:
  2007. result = self._interleave()
  2008. if self.ndim == 2:
  2009. result = result.T
  2010. return result[loc]
  2011. # unique
  2012. dtype = _interleaved_dtype(self.blocks)
  2013. n = len(items)
  2014. result = np.empty(n, dtype=dtype)
  2015. for blk in self.blocks:
  2016. # Such assignment may incorrectly coerce NaT to None
  2017. # result[blk.mgr_locs] = blk._slice((slice(None), loc))
  2018. for i, rl in enumerate(blk.mgr_locs):
  2019. result[rl] = blk._try_coerce_result(blk.iget((i, loc)))
  2020. return result
  2021. def consolidate(self):
  2022. """
  2023. Join together blocks having same dtype
  2024. Returns
  2025. -------
  2026. y : BlockManager
  2027. """
  2028. if self.is_consolidated():
  2029. return self
  2030. bm = self.__class__(self.blocks, self.axes)
  2031. bm._consolidate_inplace()
  2032. return bm
  2033. def _consolidate_inplace(self):
  2034. if not self.is_consolidated():
  2035. self.blocks = tuple(_consolidate(self.blocks))
  2036. self._is_consolidated = True
  2037. self._known_consolidated = True
  2038. self._rebuild_blknos_and_blklocs()
  2039. def get(self, item, fastpath=True):
  2040. """
  2041. Return values for selected item (ndarray or BlockManager).
  2042. """
  2043. if self.items.is_unique:
  2044. if not isnull(item):
  2045. loc = self.items.get_loc(item)
  2046. else:
  2047. indexer = np.arange(len(self.items))[isnull(self.items)]
  2048. # allow a single nan location indexer
  2049. if not np.isscalar(indexer):
  2050. if len(indexer) == 1:
  2051. loc = indexer.item()
  2052. else:
  2053. raise ValueError("cannot label index with a null key")
  2054. return self.iget(loc, fastpath=fastpath)
  2055. else:
  2056. if isnull(item):
  2057. raise ValueError("cannot label index with a null key")
  2058. indexer = self.items.get_indexer_for([item])
  2059. return self.reindex_indexer(new_axis=self.items[indexer],
  2060. indexer=indexer, axis=0, allow_dups=True)
  2061. def iget(self, i, fastpath=True):
  2062. """
  2063. Return the data as a SingleBlockManager if fastpath=True and possible
  2064. Otherwise return as a ndarray
  2065. """
  2066. block = self.blocks[self._blknos[i]]
  2067. values = block.iget(self._blklocs[i])
  2068. if not fastpath or block.is_sparse or values.ndim != 1:
  2069. return values
  2070. # fastpath shortcut for select a single-dim from a 2-dim BM
  2071. return SingleBlockManager([ block.make_block_same_class(values,
  2072. placement=slice(0, len(values)),
  2073. fastpath=True) ],
  2074. self.axes[1])
  2075. def get_scalar(self, tup):
  2076. """
  2077. Retrieve single item
  2078. """
  2079. full_loc = list(ax.get_loc(x)
  2080. for ax, x in zip(self.axes, tup))
  2081. blk = self.blocks[self._blknos[full_loc[0]]]
  2082. full_loc[0] = self._blklocs[full_loc[0]]
  2083. # FIXME: this may return non-upcasted types?
  2084. return blk.values[tuple(full_loc)]
    def delete(self, item):
        """
        Delete selected item (items if non-unique) in-place.

        Blocks that lose all of their items are dropped; the remaining
        blocks have their manager locations shifted down to close the gaps.
        """
        indexer = self.items.get_loc(item)

        # boolean mask over the item axis marking what is being removed
        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        # for every old location: minus the number of deleted locations at or
        # before it, i.e. the shift a surviving location has to receive
        ref_loc_offset = -is_deleted.cumsum()

        is_blk_deleted = [False] * len(self.blocks)

        # first deleted position; only block numbers found from here on are
        # visited by the loop below
        if isinstance(indexer, int):
            affected_start = indexer
        else:
            affected_start = is_deleted.nonzero()[0][0]

        # NOTE(review): _fast_count_smallints presumably yields
        # (block number, count) pairs for the given blknos -- confirm.
        for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]):
            blk = self.blocks[blkno]
            bml = blk.mgr_locs
            # positions within the block that are being deleted
            blk_del = is_deleted[bml.indexer].nonzero()[0]

            if len(blk_del) == len(bml):
                # the whole block goes away
                is_blk_deleted[blkno] = True
                continue
            elif len(blk_del) != 0:
                blk.delete(blk_del)
                # re-read the placement after the block dropped rows
                bml = blk.mgr_locs

            blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer])

        # FIXME: use Index.delete as soon as it uses fastpath=True
        self.axes[0] = self.items[~is_deleted]
        self.blocks = tuple(b for blkno, b in enumerate(self.blocks)
                            if not is_blk_deleted[blkno])
        # invalidate the cached shape and rebuild the item -> block lookups
        self._shape = None
        self._rebuild_blknos_and_blklocs()
  2115. def set(self, item, value, check=False):
  2116. """
  2117. Set new item in-place. Does not consolidate. Adds new Block if not
  2118. contained in the current set of items
  2119. if check, then validate that we are not setting the same data in-place
  2120. """
  2121. # FIXME: refactor, clearly separate broadcasting & zip-like assignment
  2122. value_is_sparse = isinstance(value, SparseArray)
  2123. if value_is_sparse:
  2124. assert self.ndim == 2
  2125. def value_getitem(placement):
  2126. return value
  2127. else:
  2128. if value.ndim == self.ndim - 1:
  2129. value = value.reshape((1,) + value.shape)
  2130. def value_getitem(placement):
  2131. return value
  2132. else:
  2133. def value_getitem(placement):
  2134. return value[placement.indexer]
  2135. if value.shape[1:] != self.shape[1:]:
  2136. raise AssertionError('Shape of new values must be compatible '
  2137. 'with manager shape')
  2138. try:
  2139. loc = self.items.get_loc(item)
  2140. except KeyError:
  2141. # This item wasn't present, just insert at end
  2142. self.insert(len(self.items), item, value)
  2143. return
  2144. if isinstance(loc, int):
  2145. loc = [loc]
  2146. blknos = self._blknos[loc]
  2147. blklocs = self._blklocs[loc]
  2148. unfit_mgr_locs = []
  2149. unfit_val_locs = []
  2150. removed_blknos = []
  2151. for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks),
  2152. group=True):
  2153. blk = self.blocks[blkno]
  2154. blk_locs = blklocs[val_locs.indexer]
  2155. if blk.should_store(value):
  2156. blk.set(blk_locs, value_getitem(val_locs), check=check)
  2157. else:
  2158. unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
  2159. unfit_val_locs.append(val_locs)
  2160. # If all block items are unfit, schedule the block for removal.
  2161. if len(val_locs) == len(blk.mgr_locs):
  2162. removed_blknos.append(blkno)
  2163. else:
  2164. self._blklocs[blk.mgr_locs.indexer] = -1
  2165. blk.delete(blk_locs)
  2166. self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))
  2167. if len(removed_blknos):
  2168. # Remove blocks & update blknos accordingly
  2169. is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
  2170. is_deleted[removed_blknos] = True
  2171. new_blknos = np.empty(self.nblocks, dtype=np.int64)
  2172. new_blknos.fill(-1)
  2173. new_blknos[~is_deleted] = np.arange(self.nblocks -
  2174. len(removed_blknos))
  2175. self._blknos = com.take_1d(new_blknos, self._blknos, axis=0,
  2176. allow_fill=False)
  2177. self.blocks = tuple(blk for i, blk in enumerate(self.blocks)
  2178. if i not in set(removed_blknos))
  2179. if unfit_val_locs:
  2180. unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
  2181. unfit_count = len(unfit_mgr_locs)
  2182. new_blocks = []
  2183. if value_is_sparse:
  2184. # This code (ab-)uses the fact that sparse blocks contain only
  2185. # one item.
  2186. new_blocks.extend(
  2187. make_block(values=value.copy(), ndim=self.ndim,
  2188. placement=slice(mgr_loc, mgr_loc + 1))
  2189. for mgr_loc in unfit_mgr_locs)
  2190. self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) +
  2191. len(self.blocks))
  2192. self._blklocs[unfit_mgr_locs] = 0
  2193. else:
  2194. # unfit_val_locs contains BlockPlacement objects
  2195. unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
  2196. new_blocks.append(
  2197. make_block(values=value_getitem(unfit_val_items),
  2198. ndim=self.ndim, placement=unfit_mgr_locs))
  2199. self._blknos[unfit_mgr_locs] = len(self.blocks)
  2200. self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)
  2201. self.blocks += tuple(new_blocks)
  2202. # Newly created block's dtype may already be present.
  2203. self._known_consolidated = False
  2204. def insert(self, loc, item, value, allow_duplicates=False):
  2205. """
  2206. Insert item at selected position.
  2207. Parameters
  2208. ----------
  2209. loc : int
  2210. item : hashable
  2211. value : array_like
  2212. allow_duplicates: bool
  2213. If False, trying to insert non-unique item will raise
  2214. """
  2215. if not allow_duplicates and item in self.items:
  2216. # Should this be a different kind of error??
  2217. raise ValueError('cannot insert %s, already exists' % item)
  2218. if not isinstance(loc, int):
  2219. raise TypeError("loc must be int")
  2220. block = make_block(values=value,
  2221. ndim=self.ndim,
  2222. placement=slice(loc, loc+1))
  2223. for blkno, count in _fast_count_smallints(self._blknos[loc:]):
  2224. blk = self.blocks[blkno]
  2225. if count == len(blk.mgr_locs):
  2226. blk.mgr_locs = blk.mgr_locs.add(1)
  2227. else:
  2228. new_mgr_locs = blk.mgr_locs.as_array.copy()
  2229. new_mgr_locs[new_mgr_locs >= loc] += 1
  2230. blk.mgr_locs = new_mgr_locs
  2231. if loc == self._blklocs.shape[0]:
  2232. # np.append is a lot faster (at least in numpy 1.7.1), let's use it
  2233. # if we can.
  2234. self._blklocs = np.append(self._blklocs, 0)
  2235. self._blknos = np.append(self._blknos, len(self.blocks))
  2236. else:
  2237. self._blklocs = np.insert(self._blklocs, loc, 0)
  2238. self._blknos = np.insert(self._blknos, loc, len(self.blocks))
  2239. self.axes[0] = self.items.insert(loc, item)
  2240. self.blocks += (block,)
  2241. self._shape = None
  2242. self._known_consolidated = False
  2243. if len(self.blocks) > 100:
  2244. self._consolidate_inplace()
  2245. def reindex_axis(self, new_index, axis, method=None, limit=None,
  2246. fill_value=None, copy=True):
  2247. """
  2248. Conform block manager to new index.
  2249. """
  2250. new_index = _ensure_index(new_index)
  2251. new_index, indexer = self.axes[axis].reindex(
  2252. new_index, method=method, limit=limit, copy_if_needed=True)
  2253. return self.reindex_indexer(new_index, indexer, axis=axis,
  2254. fill_value=fill_value, copy=copy)
  2255. def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
  2256. allow_dups=False, copy=True):
  2257. """
  2258. Parameters
  2259. ----------
  2260. new_axis : Index
  2261. indexer : ndarray of int64 or None
  2262. axis : int
  2263. fill_value : object
  2264. allow_dups : bool
  2265. pandas-indexer with -1's only.
  2266. """
  2267. if indexer is None:
  2268. if new_axis is self.axes[axis] and not copy:
  2269. return self
  2270. result = self.copy(deep=copy)
  2271. result.axes = list(self.axes)
  2272. result.axes[axis] = new_axis
  2273. return result
  2274. self._consolidate_inplace()
  2275. # trying to reindex on an axis with duplicates
  2276. if (not allow_dups and not self.axes[axis].is_unique
  2277. and len(indexer)):
  2278. raise ValueError("cannot reindex from a duplicate axis")
  2279. if axis >= self.ndim:
  2280. raise IndexError("Requested axis not found in manager")
  2281. if axis == 0:
  2282. new_blocks = self._slice_take_blocks_ax0(
  2283. indexer, fill_tuple=(fill_value,))
  2284. else:
  2285. new_blocks = [blk.take_nd(indexer, axis=axis,
  2286. fill_tuple=(fill_value if fill_value is not None else
  2287. blk.fill_value,))
  2288. for blk in self.blocks]
  2289. new_axes = list(self.axes)
  2290. new_axes[axis] = new_axis
  2291. return self.__class__(new_blocks, new_axes)
    def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : passed to ``_preprocess_slice_or_indexer``
            together with the length of axis 0
        fill_tuple : 1-tuple ``(fill_value,)`` or None
            A non-None value enables filling: locations whose block number
            comes out as -1 are backed by a freshly made NA block.

        Returns
        -------
        new_blocks : list of Block

        """

        allow_fill = fill_tuple is not None

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill)

        if self._is_single_block:
            blk = self.blocks[0]

            if sl_type in ('slice', 'mask'):
                return [blk.getitem_block(slobj,
                                          new_mgr_locs=slice(0, sllen))]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_tuple[0] is None:
                    # no explicit fill value: use the one that goes with the
                    # promoted dtype of the block
                    _, fill_value = com._maybe_promote(blk.dtype)
                    fill_tuple = (fill_value,)

                return [blk.take_nd(slobj, axis=0,
                                    new_mgr_locs=slice(0, sllen),
                                    fill_tuple=fill_tuple)]
            # otherwise (filling requested on a multi-dimensional single
            # block) fall through to the general path below

        # Translate the requested item locations into (block number,
        # location within block) pairs.
        if sl_type in ('slice', 'mask'):
            blknos = self._blknos[slobj]
            blklocs = self._blklocs[slobj]
        else:
            blknos = com.take_1d(self._blknos, slobj, fill_value=-1,
                                 allow_fill=allow_fill)
            blklocs = com.take_1d(self._blklocs, slobj, fill_value=-1,
                                  allow_fill=allow_fill)

        # When filling blknos, make sure blknos is updated before appending to
        # blocks list, that way new blkno is exactly len(blocks).
        #
        # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order,
        # pytables serialization will break otherwise.
        blocks = []
        for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks),
                                                     group=True):
            if blkno == -1:
                # If we've got here, fill_tuple was not None.
                fill_value = fill_tuple[0]

                blocks.append(self._make_na_block(
                    placement=mgr_locs, fill_value=fill_value))
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if blk.is_sparse:
                    # A sparse block, it's easy, because there's only one item
                    # and each mgr loc is a copy of that single item.
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=True)
                        newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)
                        blocks.append(newblk)

                else:
                    blocks.append(blk.take_nd(
                        blklocs[mgr_locs.indexer], axis=0,
                        new_mgr_locs=mgr_locs, fill_tuple=None))

        return blocks
  2351. def _make_na_block(self, placement, fill_value=None):
  2352. # TODO: infer dtypes other than float64 from fill_value
  2353. if fill_value is None:
  2354. fill_value = np.nan
  2355. block_shape = list(self.shape)
  2356. block_shape[0] = len(placement)
  2357. dtype, fill_value = com._infer_dtype_from_scalar(fill_value)
  2358. block_values = np.empty(block_shape, dtype=dtype)
  2359. block_values.fill(fill_value)
  2360. return make_block(block_values, placement=placement)
  2361. def take(self, indexer, axis=1, verify=True, convert=True):
  2362. """
  2363. Take items along any axis.
  2364. """
  2365. self._consolidate_inplace()
  2366. indexer = np.asanyarray(indexer, dtype=np.int_)
  2367. n = self.shape[axis]
  2368. if convert:
  2369. indexer = _maybe_convert_indices(indexer, n)
  2370. if verify:
  2371. if ((indexer == -1) | (indexer >= n)).any():
  2372. raise Exception('Indices must be nonzero and less than '
  2373. 'the axis length')
  2374. new_labels = self.axes[axis].take(indexer)
  2375. return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
  2376. axis=axis, allow_dups=True)
  2377. def merge(self, other, lsuffix='', rsuffix=''):
  2378. if not self._is_indexed_like(other):
  2379. raise AssertionError('Must have same axes to merge managers')
  2380. l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix,
  2381. right=other.items, rsuffix=rsuffix)
  2382. new_items = _concat_indexes([l, r])
  2383. new_blocks = [blk.copy(deep=False)
  2384. for blk in self.blocks]
  2385. offset = self.shape[0]
  2386. for blk in other.blocks:
  2387. blk = blk.copy(deep=False)
  2388. blk.mgr_locs = blk.mgr_locs.add(offset)
  2389. new_blocks.append(blk)
  2390. new_axes = list(self.axes)
  2391. new_axes[0] = new_items
  2392. return self.__class__(_consolidate(new_blocks), new_axes)
  2393. def _is_indexed_like(self, other):
  2394. """
  2395. Check all axes except items
  2396. """
  2397. if self.ndim != other.ndim:
  2398. raise AssertionError(('Number of dimensions must agree '
  2399. 'got %d and %d') % (self.ndim, other.ndim))
  2400. for ax, oax in zip(self.axes[1:], other.axes[1:]):
  2401. if not ax.equals(oax):
  2402. return False
  2403. return True
  2404. def equals(self, other):
  2405. self_axes, other_axes = self.axes, other.axes
  2406. if len(self_axes) != len(other_axes):
  2407. return False
  2408. if not all (ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
  2409. return False
  2410. self._consolidate_inplace()
  2411. other._consolidate_inplace()
  2412. return all(block.equals(oblock) for block, oblock in
  2413. zip(self.blocks, other.blocks))
  2414. class SingleBlockManager(BlockManager):
  2415. """ manage a single block with """
  2416. ndim = 1
  2417. _is_consolidated = True
  2418. _known_consolidated = True
  2419. __slots__ = ()
  2420. def __init__(self, block, axis, do_integrity_check=False, fastpath=False):
  2421. if isinstance(axis, list):
  2422. if len(axis) != 1:
  2423. raise ValueError(
  2424. "cannot create SingleBlockManager with more than 1 axis")
  2425. axis = axis[0]
  2426. # passed from constructor, single block, single axis
  2427. if fastpath:
  2428. self.axes = [axis]
  2429. if isinstance(block, list):
  2430. # empty block
  2431. if len(block) == 0:
  2432. block = [np.array([])]
  2433. elif len(block) != 1:
  2434. raise ValueError('Cannot create SingleBlockManager with '
  2435. 'more than 1 block')
  2436. block = block[0]
  2437. else:
  2438. self.axes = [_ensure_index(axis)]
  2439. # create the block here
  2440. if isinstance(block, list):
  2441. # provide consolidation to the interleaved_dtype
  2442. if len(block) > 1:
  2443. dtype = _interleaved_dtype(block)
  2444. block = [b.astype(dtype) for b in block]
  2445. block = _consolidate(block)
  2446. if len(block) != 1:
  2447. raise ValueError('Cannot create SingleBlockManager with '
  2448. 'more than 1 block')
  2449. block = block[0]
  2450. if not isinstance(block, Block):
  2451. block = make_block(block,
  2452. placement=slice(0, len(axis)),
  2453. ndim=1, fastpath=True)
  2454. self.blocks = [block]
  2455. def _post_setstate(self):
  2456. pass
  2457. @property
  2458. def _block(self):
  2459. return self.blocks[0]
  2460. @property
  2461. def _values(self):
  2462. return self._block.values
  2463. def reindex(self, new_axis, indexer=None, method=None, fill_value=None,
  2464. limit=None, copy=True):
  2465. # if we are the same and don't copy, just return
  2466. if self.index.equals(new_axis):
  2467. if copy:
  2468. return self.copy(deep=True)
  2469. else:
  2470. return self
  2471. values = self._block.get_values()
  2472. if indexer is None:
  2473. indexer = self.items.get_indexer_for(new_axis)
  2474. if fill_value is None:
  2475. # FIXME: is fill_value used correctly in sparse blocks?
  2476. if not self._block.is_sparse:
  2477. fill_value = self._block.fill_value
  2478. else:
  2479. fill_value = np.nan
  2480. new_values = com.take_1d(values, indexer,
  2481. fill_value=fill_value)
  2482. # fill if needed
  2483. if method is not None or limit is not None:
  2484. new_values = com.interpolate_2d(new_values, method=method,
  2485. limit=limit, fill_value=fill_value)
  2486. if self._block.is_sparse:
  2487. make_block = self._block.make_block_same_class
  2488. block = make_block(new_values, copy=copy,
  2489. placement=slice(0, len(new_axis)))
  2490. mgr = SingleBlockManager(block, new_axis)
  2491. mgr._consolidate_inplace()
  2492. return mgr
  2493. def get_slice(self, slobj, axis=0):
  2494. if axis >= self.ndim:
  2495. raise IndexError("Requested axis not found in manager")
  2496. return self.__class__(self._block._slice(slobj),
  2497. self.index[slobj], fastpath=True)
@property
def index(self):
    """Return the first (and, for this manager, only used) axis."""
    return self.axes[0]
  2501. def convert(self, **kwargs):
  2502. """ convert the whole block as one """
  2503. kwargs['by_item'] = False
  2504. return self.apply('convert', **kwargs)
@property
def dtype(self):
    """Return the dtype of the underlying block's values."""
    return self._values.dtype
@property
def ftype(self):
    """Return the ``ftype`` reported by the underlying block."""
    return self._block.ftype
def get_dtype_counts(self):
    """Return ``{dtype name: 1}``; there is exactly one dtype here."""
    return {self.dtype.name: 1}
def get_ftype_counts(self):
    """Return ``{ftype: 1}``; there is exactly one ftype here."""
    return {self.ftype: 1}
def get_dtypes(self):
    """Return a one-element array holding the block's dtype."""
    return np.array([self._block.dtype])
def get_ftypes(self):
    """Return a one-element array holding the block's ftype."""
    return np.array([self._block.ftype])
@property
def values(self):
    """Return ``.view()`` of the underlying block's values."""
    return self._values.view()
@property
def itemsize(self):
    """Return the ``itemsize`` of the underlying block's values."""
    return self._values.itemsize
@property
def _can_hold_na(self):
    """Return the underlying block's ``_can_hold_na`` flag."""
    return self._block._can_hold_na
def is_consolidated(self):
    """Always True for this manager."""
    return True
def _consolidate_check(self):
    # No-op: is_consolidated() is unconditionally True for this manager,
    # so there is no state to refresh.
    pass
def _consolidate_inplace(self):
    # No-op: nothing is merged or rearranged for this manager.
    pass
def delete(self, item):
    """
    Delete single item from SingleBlockManager.

    The item is removed from the existing block in place (the block
    object itself is kept), so ``self.blocks`` doesn't become empty, and
    the matching entry is dropped from ``self.axes[0]``.
    """
    # Position of the item, looked up through ``self.items``.
    loc = self.items.get_loc(item)
    self._block.delete(loc)
    # Index.delete returns a new index, hence the re-assignment.
    self.axes[0] = self.axes[0].delete(loc)
def fast_xs(self, loc):
    """
    Fast path for getting a cross-section: index the underlying block's
    values directly with ``loc`` and return the result.

    NOTE(review): whether the result is a view or a scalar/copy depends
    on what ``loc`` is; confirm against callers.
    """
    return self._block.values[loc]
  2548. def construction_error(tot_items, block_shape, axes, e=None):
  2549. """ raise a helpful message about our construction """
  2550. passed = tuple(map(int, [tot_items] + list(block_shape)))
  2551. implied = tuple(map(int, [len(ax) for ax in axes]))
  2552. if passed == implied and e is not None:
  2553. raise e
  2554. raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
  2555. passed,implied))
  2556. def create_block_manager_from_blocks(blocks, axes):
  2557. try:
  2558. if len(blocks) == 1 and not isinstance(blocks[0], Block):
  2559. # It's OK if a single block is passed as values, its placement is
  2560. # basically "all items", but if there're many, don't bother
  2561. # converting, it's an error anyway.
  2562. blocks = [make_block(values=blocks[0],
  2563. placement=slice(0, len(axes[0])))]
  2564. mgr = BlockManager(blocks, axes)
  2565. mgr._consolidate_inplace()
  2566. return mgr
  2567. except (ValueError) as e:
  2568. blocks = [getattr(b, 'values', b) for b in blocks]
  2569. tot_items = sum(b.shape[0] for b in blocks)
  2570. construction_error(tot_items, blocks[0].shape[1:], axes, e)
  2571. def create_block_manager_from_arrays(arrays, names, axes):
  2572. try:
  2573. blocks = form_blocks(arrays, names, axes)
  2574. mgr = BlockManager(blocks, axes)
  2575. mgr._consolidate_inplace()
  2576. return mgr
  2577. except (ValueError) as e:
  2578. construction_error(len(arrays), arrays[0].shape[1:], axes, e)
def form_blocks(arrays, names, axes):
    """
    Sort ``arrays`` into per-kind buckets and build blocks from them.

    Each bucket entry is a tuple ``(i, k, v)``: ``i`` is the position in
    ``axes[0]``, ``k`` the matching entry of ``names`` and ``v`` the
    array.  Positions of ``axes[0]`` with no matching name are collected
    in ``extra_locs`` and end up in one object block filled with NaN.

    Returns
    -------
    list of blocks, in the order float, complex, int, datetime, bool,
    object, sparse, then the NaN block for unmatched items.
    """
    # put "leftover" items in float bucket, where else?
    # generalize?
    float_items = []
    complex_items = []
    int_items = []
    bool_items = []
    object_items = []
    sparse_items = []
    datetime_items = []
    extra_locs = []

    # Map every position of axes[0] to a position in names (-1: missing).
    names_idx = Index(names)
    if names_idx.equals(axes[0]):
        names_indexer = np.arange(len(names_idx))
    else:
        assert names_idx.intersection(axes[0]).is_unique
        names_indexer = names_idx.get_indexer_for(axes[0])

    for i, name_idx in enumerate(names_indexer):
        if name_idx == -1:
            extra_locs.append(i)
            continue

        k = names[name_idx]
        v = arrays[name_idx]

        # The sparse check comes first because it is based on the
        # container type rather than on v.dtype.
        if isinstance(v, (SparseArray, ABCSparseSeries)):
            sparse_items.append((i, k, v))
        elif issubclass(v.dtype.type, np.floating):
            float_items.append((i, k, v))
        elif issubclass(v.dtype.type, np.complexfloating):
            complex_items.append((i, k, v))
        elif issubclass(v.dtype.type, np.datetime64):
            if v.dtype != _NS_DTYPE:
                v = tslib.cast_to_nanoseconds(v)

            # Values carrying a non-None ``tz`` go to the object bucket.
            if hasattr(v, 'tz') and v.tz is not None:
                object_items.append((i, k, v))
            else:
                datetime_items.append((i, k, v))
        elif issubclass(v.dtype.type, np.integer):
            if v.dtype == np.uint64:
                # HACK #2355 definite overflow: values above the int64
                # maximum are kept as object.
                if (v > 2 ** 63 - 1).any():
                    object_items.append((i, k, v))
                    continue
            int_items.append((i, k, v))
        elif v.dtype == np.bool_:
            bool_items.append((i, k, v))
        else:
            object_items.append((i, k, v))

    blocks = []
    # Floats and ints keep their individual dtypes (one block per run of
    # equal dtype); the other buckets are coerced to one fixed dtype.
    if len(float_items):
        float_blocks = _multi_blockify(float_items)
        blocks.extend(float_blocks)

    if len(complex_items):
        complex_blocks = _simple_blockify(
            complex_items, np.complex128)
        blocks.extend(complex_blocks)

    if len(int_items):
        int_blocks = _multi_blockify(int_items)
        blocks.extend(int_blocks)

    if len(datetime_items):
        datetime_blocks = _simple_blockify(
            datetime_items, _NS_DTYPE)
        blocks.extend(datetime_blocks)

    if len(bool_items):
        bool_blocks = _simple_blockify(
            bool_items, np.bool_)
        blocks.extend(bool_blocks)

    if len(object_items) > 0:
        object_blocks = _simple_blockify(
            object_items, np.object_)
        blocks.extend(object_blocks)

    if len(sparse_items) > 0:
        sparse_blocks = _sparse_blockify(sparse_items)
        blocks.extend(sparse_blocks)

    if len(extra_locs):
        # One row per unmatched item; remaining dims follow axes[1:].
        shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])

        # empty items -> dtype object
        block_values = np.empty(shape, dtype=object)
        block_values.fill(np.nan)

        na_block = make_block(block_values, placement=extra_locs)
        blocks.append(na_block)

    return blocks
  2660. def _simple_blockify(tuples, dtype):
  2661. """ return a single array of a block that has a single dtype; if dtype is
  2662. not None, coerce to this dtype
  2663. """
  2664. values, placement = _stack_arrays(tuples, dtype)
  2665. # CHECK DTYPE?
  2666. if dtype is not None and values.dtype != dtype: # pragma: no cover
  2667. values = values.astype(dtype)
  2668. block = make_block(values, placement=placement)
  2669. return [block]
  2670. def _multi_blockify(tuples, dtype=None):
  2671. """ return an array of blocks that potentially have different dtypes """
  2672. # group by dtype
  2673. grouper = itertools.groupby(tuples, lambda x: x[2].dtype)
  2674. new_blocks = []
  2675. for dtype, tup_block in grouper:
  2676. values, placement = _stack_arrays(
  2677. list(tup_block), dtype)
  2678. block = make_block(values, placement=placement)
  2679. new_blocks.append(block)
  2680. return new_blocks
  2681. def _sparse_blockify(tuples, dtype=None):
  2682. """ return an array of blocks that potentially have different dtypes (and
  2683. are sparse)
  2684. """
  2685. new_blocks = []
  2686. for i, names, array in tuples:
  2687. array = _maybe_to_sparse(array)
  2688. block = make_block(
  2689. array, klass=SparseBlock, fastpath=True,
  2690. placement=[i])
  2691. new_blocks.append(block)
  2692. return new_blocks
  2693. def _stack_arrays(tuples, dtype):
  2694. # fml
  2695. def _asarray_compat(x):
  2696. if isinstance(x, ABCSeries):
  2697. return x.values
  2698. else:
  2699. return np.asarray(x)
  2700. def _shape_compat(x):
  2701. if isinstance(x, ABCSeries):
  2702. return len(x),
  2703. else:
  2704. return x.shape
  2705. placement, names, arrays = zip(*tuples)
  2706. first = arrays[0]
  2707. shape = (len(arrays),) + _shape_compat(first)
  2708. stacked = np.empty(shape, dtype=dtype)
  2709. for i, arr in enumerate(arrays):
  2710. stacked[i] = _asarray_compat(arr)
  2711. return stacked, placement
  2712. def _interleaved_dtype(blocks):
  2713. if not len(blocks):
  2714. return None
  2715. counts = defaultdict(lambda: [])
  2716. for x in blocks:
  2717. counts[type(x)].append(x)
  2718. def _lcd_dtype(l):
  2719. """ find the lowest dtype that can accomodate the given types """
  2720. m = l[0].dtype
  2721. for x in l[1:]:
  2722. if x.dtype.itemsize > m.itemsize:
  2723. m = x.dtype
  2724. return m
  2725. have_int = len(counts[IntBlock]) > 0
  2726. have_bool = len(counts[BoolBlock]) > 0
  2727. have_object = len(counts[ObjectBlock]) > 0
  2728. have_float = len(counts[FloatBlock]) > 0
  2729. have_complex = len(counts[ComplexBlock]) > 0
  2730. have_dt64 = len(counts[DatetimeBlock]) > 0
  2731. have_td64 = len(counts[TimeDeltaBlock]) > 0
  2732. have_sparse = len(counts[SparseBlock]) > 0
  2733. have_numeric = have_float or have_complex or have_int
  2734. if (have_object or
  2735. (have_bool and have_numeric) or
  2736. (have_numeric and (have_dt64 or have_td64))):
  2737. return np.dtype(object)
  2738. elif have_bool:
  2739. return np.dtype(bool)
  2740. elif have_int and not have_float and not have_complex:
  2741. # if we are mixing unsigned and signed, then return
  2742. # the next biggest int type (if we can)
  2743. lcd = _lcd_dtype(counts[IntBlock])
  2744. kinds = set([i.dtype.kind for i in counts[IntBlock]])
  2745. if len(kinds) == 1:
  2746. return lcd
  2747. if lcd == 'uint64' or lcd == 'int64':
  2748. return np.dtype('int64')
  2749. # return 1 bigger on the itemsize if unsinged
  2750. if lcd.kind == 'u':
  2751. return np.dtype('int%s' % (lcd.itemsize * 8 * 2))
  2752. return lcd
  2753. elif have_dt64 and not have_float and not have_complex:
  2754. return np.dtype('M8[ns]')
  2755. elif have_td64 and not have_float and not have_complex:
  2756. return np.dtype('m8[ns]')
  2757. elif have_complex:
  2758. return np.dtype('c16')
  2759. else:
  2760. return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock])
  2761. def _consolidate(blocks):
  2762. """
  2763. Merge blocks having same dtype, exclude non-consolidating blocks
  2764. """
  2765. # sort by _can_consolidate, dtype
  2766. gkey = lambda x: x._consolidate_key
  2767. grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)
  2768. new_blocks = []
  2769. for (_can_consolidate, dtype), group_blocks in grouper:
  2770. merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype,
  2771. _can_consolidate=_can_consolidate)
  2772. if isinstance(merged_blocks, list):
  2773. new_blocks.extend(merged_blocks)
  2774. else:
  2775. new_blocks.append(merged_blocks)
  2776. return new_blocks
  2777. def _merge_blocks(blocks, dtype=None, _can_consolidate=True):
  2778. if len(blocks) == 1:
  2779. return blocks[0]
  2780. if _can_consolidate:
  2781. if dtype is None:
  2782. if len(set([b.dtype for b in blocks])) != 1:
  2783. raise AssertionError("_merge_blocks are invalid!")
  2784. dtype = blocks[0].dtype
  2785. # FIXME: optimization potential in case all mgrs contain slices and
  2786. # combination of those slices is a slice, too.
  2787. new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
  2788. new_values = _vstack([b.values for b in blocks], dtype)
  2789. argsort = np.argsort(new_mgr_locs)
  2790. new_values = new_values[argsort]
  2791. new_mgr_locs = new_mgr_locs[argsort]
  2792. return make_block(new_values,
  2793. fastpath=True, placement=new_mgr_locs)
  2794. # no merge
  2795. return blocks
  2796. def _block_shape(values, ndim=1, shape=None):
  2797. """ guarantee the shape of the values to be at least 1 d """
  2798. if values.ndim <= ndim:
  2799. if shape is None:
  2800. shape = values.shape
  2801. values = values.reshape(tuple((1,) + shape))
  2802. return values
  2803. def _vstack(to_stack, dtype):
  2804. # work around NumPy 1.6 bug
  2805. if dtype == _NS_DTYPE or dtype == _TD_DTYPE:
  2806. new_values = np.vstack([x.view('i8') for x in to_stack])
  2807. return new_values.view(dtype)
  2808. else:
  2809. return np.vstack(to_stack)
  2810. def _possibly_compare(a, b, op):
  2811. res = op(a, b)
  2812. is_a_array = isinstance(a, np.ndarray)
  2813. is_b_array = isinstance(b, np.ndarray)
  2814. if np.isscalar(res) and (is_a_array or is_b_array):
  2815. type_names = [type(a).__name__, type(b).__name__]
  2816. if is_a_array:
  2817. type_names[0] = 'ndarray(dtype=%s)' % a.dtype
  2818. if is_b_array:
  2819. type_names[1] = 'ndarray(dtype=%s)' % b.dtype
  2820. raise TypeError("Cannot compare types %r and %r" % tuple(type_names))
  2821. return res
def _concat_indexes(indexes):
    """Append all remaining indexes to the first one; return the result."""
    return indexes[0].append(indexes[1:])
def _get_blkno_placements(blknos, blk_count, group=True):
    """
    Iterate over block numbers together with their placements.

    Parameters
    ----------
    blknos : array of int64
    blk_count : int
        Currently unused (see the FIXME below).
    group : bool
        Passed through to ``lib.get_blkno_indexers``.

    Returns
    -------
    iterator
        yield (blkno, BlockPlacement)

    """

    blknos = com._ensure_int64(blknos)

    # FIXME: blk_count is unused, but it may avoid the use of dicts in cython
    for blkno, indexer in lib.get_blkno_indexers(blknos, group):
        yield blkno, BlockPlacement(indexer)
  2840. def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
  2841. """
  2842. If two indices overlap, add suffixes to overlapping entries.
  2843. If corresponding suffix is empty, the entry is simply converted to string.
  2844. """
  2845. to_rename = left.intersection(right)
  2846. if len(to_rename) == 0:
  2847. return left, right
  2848. else:
  2849. if not lsuffix and not rsuffix:
  2850. raise ValueError('columns overlap but no suffix specified: %s' %
  2851. to_rename)
  2852. def lrenamer(x):
  2853. if x in to_rename:
  2854. return '%s%s' % (x, lsuffix)
  2855. return x
  2856. def rrenamer(x):
  2857. if x in to_rename:
  2858. return '%s%s' % (x, rsuffix)
  2859. return x
  2860. return (_transform_index(left, lrenamer),
  2861. _transform_index(right, rrenamer))
  2862. def _transform_index(index, func):
  2863. """
  2864. Apply function to all values found in index.
  2865. This includes transforming multiindex entries separately.
  2866. """
  2867. if isinstance(index, MultiIndex):
  2868. items = [tuple(func(y) for y in x) for x in index]
  2869. return MultiIndex.from_tuples(items, names=index.names)
  2870. else:
  2871. items = [func(x) for x in index]
  2872. return Index(items, name=index.name)
def _putmask_smart(v, m, n):
    """
    Return a new array with ``n`` put where ``m`` is set, trying to
    preserve the dtype of ``v`` if possible.  ``v`` is not modified.

    Parameters
    ----------
    v : array_like
        Values to start from.
    m : array_like
        Mask selecting the positions to overwrite.
    n : array_like
        New values; a scalar is repeated to ``len(m)``.
    """

    # n should be the length of the mask or a scalar here
    if not is_list_like(n):
        n = np.array([n] * len(m))

    # see if we are only masking values that if putted
    # will work in the current dtype
    try:
        nn = n[m]
        nn_at = nn.astype(v.dtype)
        # Only take this path if the cast to v.dtype loses nothing.
        if (nn == nn_at).all():
            nv = v.copy()
            nv[m] = nn_at
            return nv
    except (ValueError, IndexError, TypeError):
        # Best effort only: fall through to the dtype-changing path.
        pass

    # change the dtype
    dtype, _ = com._maybe_promote(n.dtype)
    nv = v.astype(dtype)
    try:
        nv[m] = n
    except ValueError:
        # NOTE(review): this fallback pairs the masked positions with the
        # leading elements of n (zip(idx, n)), not with n at those
        # positions -- confirm that this is intended.
        idx, = np.where(np.squeeze(m))
        for mask_index, new_val in zip(idx, n):
            nv[mask_index] = new_val

    return nv
  2906. def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
  2907. """
  2908. Concatenate block managers into one.
  2909. Parameters
  2910. ----------
  2911. mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
  2912. axes : list of Index
  2913. concat_axis : int
  2914. copy : bool
  2915. """
  2916. concat_plan = combine_concat_plans([get_mgr_concatenation_plan(mgr, indexers)
  2917. for mgr, indexers in mgrs_indexers],
  2918. concat_axis)
  2919. blocks = [make_block(concatenate_join_units(join_units, concat_axis,
  2920. copy=copy),
  2921. placement=placement)
  2922. for placement, join_units in concat_plan]
  2923. return BlockManager(blocks, axes)
  2924. def get_empty_dtype_and_na(join_units):
  2925. """
  2926. Return dtype and N/A values to use when concatenating specified units.
  2927. Returned N/A value may be None which means there was no casting involved.
  2928. Returns
  2929. -------
  2930. dtype
  2931. na
  2932. """
  2933. if len(join_units) == 1:
  2934. blk = join_units[0].block
  2935. if blk is None:
  2936. return np.float64, np.nan
  2937. else:
  2938. return blk.dtype, None
  2939. has_none_blocks = False
  2940. dtypes = [None] * len(join_units)
  2941. for i, unit in enumerate(join_units):
  2942. if unit.block is None:
  2943. has_none_blocks = True
  2944. else:
  2945. dtypes[i] = unit.dtype
  2946. if not has_none_blocks and len(set(dtypes)) == 1:
  2947. # Unanimous decision, nothing to upcast.
  2948. return dtypes[0], None
  2949. # dtypes = set()
  2950. upcast_classes = set()
  2951. null_upcast_classes = set()
  2952. for dtype, unit in zip(dtypes, join_units):
  2953. if dtype is None:
  2954. continue
  2955. if issubclass(dtype.type, (np.object_, np.bool_)):
  2956. upcast_cls = 'object'
  2957. elif is_datetime64_dtype(dtype):
  2958. upcast_cls = 'datetime'
  2959. elif is_timedelta64_dtype(dtype):
  2960. upcast_cls = 'timedelta'
  2961. else:
  2962. upcast_cls = 'float'
  2963. # Null blocks should not influence upcast class selection, unless there
  2964. # are only null blocks, when same upcasting rules must be applied to
  2965. # null upcast classes.
  2966. if unit.is_null:
  2967. null_upcast_classes.add(upcast_cls)
  2968. else:
  2969. upcast_classes.add(upcast_cls)
  2970. if not upcast_classes:
  2971. upcast_classes = null_upcast_classes
  2972. # create the result
  2973. if 'object' in upcast_classes:
  2974. return np.dtype(np.object_), np.nan
  2975. elif 'float' in upcast_classes:
  2976. return np.dtype(np.float64), np.nan
  2977. elif 'datetime' in upcast_classes:
  2978. return np.dtype('M8[ns]'), tslib.iNaT
  2979. elif 'timedelta' in upcast_classes:
  2980. return np.dtype('m8[ns]'), tslib.iNaT
  2981. else: # pragma
  2982. raise AssertionError("invalid dtype determination in get_concat_dtype")
  2983. def concatenate_join_units(join_units, concat_axis, copy):
  2984. """
  2985. Concatenate values from several join units along selected axis.
  2986. """
  2987. if concat_axis == 0 and len(join_units) > 1:
  2988. # Concatenating join units along ax0 is handled in _merge_blocks.
  2989. raise AssertionError("Concatenating join units along axis0")
  2990. empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)
  2991. to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,
  2992. upcasted_na=upcasted_na)
  2993. for ju in join_units]
  2994. if len(to_concat) == 1:
  2995. # Only one block, nothing to concatenate.
  2996. concat_values = to_concat[0]
  2997. if copy and concat_values.base is not None:
  2998. concat_values = concat_values.copy()
  2999. else:
  3000. concat_values = com._concat_compat(to_concat, axis=concat_axis)
  3001. # FIXME: optimization potential: if len(join_units) == 1, single join unit
  3002. # is densified and sparsified back.
  3003. if any(unit.is_sparse for unit in join_units):
  3004. # If one of the units was sparse, concat_values are 2d and there's only
  3005. # one item.
  3006. return SparseArray(concat_values[0])
  3007. else:
  3008. return concat_values
def get_mgr_concatenation_plan(mgr, indexers):
    """
    Construct concatenation plan for given block manager and indexers.

    Note that when ``indexers`` contains key 0, that entry is popped from
    the passed dict, i.e. the caller's dict is modified.

    Parameters
    ----------
    mgr : BlockManager
    indexers : dict of {axis: indexer}

    Returns
    -------
    plan : list of (BlockPlacement, JoinUnit) tuples

    """
    # Calculate post-reindex shape , save for item axis which will be separate
    # for each block anyway.
    mgr_shape = list(mgr.shape)
    for ax, indexer in indexers.items():
        mgr_shape[ax] = len(indexer)
    mgr_shape = tuple(mgr_shape)

    if 0 in indexers:
        # Items are reindexed: translate block numbers/locations through
        # the axis-0 indexer, using -1 for missing items.
        ax0_indexer = indexers.pop(0)
        blknos = com.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)
        blklocs = com.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)
    else:

        if mgr._is_single_block:
            # Shortcut: one block and no item reindexing -> one join unit.
            blk = mgr.blocks[0]
            return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]

        ax0_indexer = None
        blknos = mgr._blknos
        blklocs = mgr._blklocs

    plan = []
    for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks),
                                                   group=False):

        assert placements.is_slice_like

        # Each unit gets its own dict so per-unit edits stay local.
        join_unit_indexers = indexers.copy()

        shape = list(mgr_shape)
        shape[0] = len(placements)
        shape = tuple(shape)

        if blkno == -1:
            # No source block for these items: placeholder unit.
            unit = JoinUnit(None, shape)
        else:
            blk = mgr.blocks[blkno]
            ax0_blk_indexer = blklocs[placements.indexer]

            unit_no_ax0_reindexing = (
                len(placements) == len(blk.mgr_locs) and
                # Fastpath detection of join unit not needing to reindex its
                # block: no ax0 reindexing took place and block placement was
                # sequential before.
                ((ax0_indexer is None
                  and blk.mgr_locs.is_slice_like
                  and blk.mgr_locs.as_slice.step == 1) or
                 # Slow-ish detection: all indexer locs are sequential (and
                 # length match is checked above).
                 (np.diff(ax0_blk_indexer) == 1).all()))

            # Omit indexer if no item reindexing is required.
            if unit_no_ax0_reindexing:
                join_unit_indexers.pop(0, None)
            else:
                join_unit_indexers[0] = ax0_blk_indexer

            unit = JoinUnit(blk, shape, join_unit_indexers)

        plan.append((placements, unit))

    return plan
def combine_concat_plans(plans, concat_axis):
    """
    Combine multiple concatenation plans into one.

    This is a generator yielding ``(placement, join_units)`` pairs.

    * A single plan is passed through, each unit wrapped in a list.
    * For ``concat_axis == 0`` the plans are chained, shifting each
      plan's placements by a running offset.
    * Otherwise the plans are walked in lockstep; when the current
      placements differ in length, the longer ones are split with
      ``trim_join_unit`` (which modifies those join units in place).

    Raises ValueError("Plan shapes are not aligned") when some plans run
    out before others.
    """
    if len(plans) == 1:
        for p in plans[0]:
            yield p[0], [p[1]]

    elif concat_axis == 0:
        offset = 0
        for plan in plans:
            last_plc = None

            for plc, unit in plan:
                yield plc.add(offset), [unit]
                last_plc = plc

            # The next plan starts where the last placement of this one
            # stopped.
            if last_plc is not None:
                offset += last_plc.as_slice.stop

    else:
        # One-element list so the nested function can update the counter.
        num_ended = [0]
        def _next_or_none(seq):
            retval = next(seq, None)
            if retval is None:
                num_ended[0] += 1
            return retval

        plans = list(map(iter, plans))
        next_items = list(map(_next_or_none, plans))

        # Loop until every plan is exhausted at the same time.
        while num_ended[0] != len(next_items):
            if num_ended[0] > 0:
                raise ValueError("Plan shapes are not aligned")

            placements, units = zip(*next_items)

            lengths = list(map(len, placements))
            min_len, max_len = min(lengths), max(lengths)

            if min_len == max_len:
                # All current pieces line up: emit and advance every plan.
                yield placements[0], units
                next_items[:] = map(_next_or_none, plans)
            else:
                yielded_placement = None
                yielded_units = [None] * len(next_items)
                for i, (plc, unit) in enumerate(next_items):
                    yielded_units[i] = unit
                    if len(plc) > min_len:
                        # trim_join_unit updates unit in place, so only
                        # placement needs to be sliced to skip min_len.
                        next_items[i] = (plc[min_len:],
                                         trim_join_unit(unit, min_len))
                    else:
                        yielded_placement = plc
                        next_items[i] = _next_or_none(plans[i])

                yield yielded_placement, yielded_units
  3118. def trim_join_unit(join_unit, length):
  3119. """
  3120. Reduce join_unit's shape along item axis to length.
  3121. Extra items that didn't fit are returned as a separate block.
  3122. """
  3123. if 0 not in join_unit.indexers:
  3124. extra_indexers = join_unit.indexers
  3125. if join_unit.block is None:
  3126. extra_block = None
  3127. else:
  3128. extra_block = join_unit.block.getitem_block(slice(length, None))
  3129. join_unit.block = join_unit.block.getitem_block(slice(length))
  3130. else:
  3131. extra_block = join_unit.block
  3132. extra_indexers = copy.copy(join_unit.indexers)
  3133. extra_indexers[0] = extra_indexers[0][length:]
  3134. join_unit.indexers[0] = join_unit.indexers[0][:length]
  3135. extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
  3136. join_unit.shape = (length,) + join_unit.shape[1:]
  3137. return JoinUnit(block=extra_block, indexers=extra_indexers,
  3138. shape=extra_shape)
  3139. class JoinUnit(object):
  3140. def __init__(self, block, shape, indexers={}):
  3141. # Passing shape explicitly is required for cases when block is None.
  3142. self.block = block
  3143. self.indexers = indexers
  3144. self.shape = shape
  3145. def __repr__(self):
  3146. return '%s(%r, %s)' % (self.__class__.__name__,
  3147. self.block, self.indexers)
  3148. @cache_readonly
  3149. def needs_filling(self):
  3150. for indexer in self.indexers.values():
  3151. # FIXME: cache results of indexer == -1 checks.
  3152. if (indexer == -1).any():
  3153. return True
  3154. return False
  3155. @cache_readonly
  3156. def dtype(self):
  3157. if self.block is None:
  3158. raise AssertionError("Block is None, no dtype")
  3159. if not self.needs_filling:
  3160. return self.block.dtype
  3161. else:
  3162. return np.dtype(com._maybe_promote(self.block.dtype,
  3163. self.block.fill_value)[0])
  3164. return self._dtype
  3165. @cache_readonly
  3166. def is_null(self):
  3167. if self.block is None:
  3168. return True
  3169. if not self.block._can_hold_na:
  3170. return False
  3171. # Usually it's enough to check but a small fraction of values to see if
  3172. # a block is NOT null, chunks should help in such cases. 1000 value
  3173. # was chosen rather arbitrarily.
  3174. values_flat = self.block.values.ravel()
  3175. total_len = values_flat.shape[0]
  3176. chunk_len = max(total_len // 40, 1000)
  3177. for i in range(0, total_len, chunk_len):
  3178. if not isnull(values_flat[i: i + chunk_len]).all():
  3179. return False
  3180. return True
  3181. @cache_readonly
  3182. def is_sparse(self):
  3183. return self.block is not None and self.block.is_sparse
  3184. def get_reindexed_values(self, empty_dtype, upcasted_na):
  3185. if upcasted_na is None:
  3186. # No upcasting is necessary
  3187. fill_value = self.block.fill_value
  3188. values = self.block.get_values()
  3189. else:
  3190. fill_value = upcasted_na
  3191. if self.is_null:
  3192. missing_arr = np.empty(self.shape, dtype=empty_dtype)
  3193. if np.prod(self.shape):
  3194. # NumPy 1.6 workaround: this statement gets strange if all
  3195. # blocks are of same dtype and some of them are empty:
  3196. # empty one are considered "null" so they must be filled,
  3197. # but no dtype upcasting happens and the dtype may not
  3198. # allow NaNs.
  3199. #
  3200. # In general, no one should get hurt when one tries to put
  3201. # incorrect values into empty array, but numpy 1.6 is
  3202. # strict about that.
  3203. missing_arr.fill(fill_value)
  3204. return missing_arr
  3205. if self.block.is_bool:
  3206. # External code requested filling/upcasting, bool values must
  3207. # be upcasted to object to avoid being upcasted to numeric.
  3208. values = self.block.astype(np.object_).values
  3209. else:
  3210. # No dtype upcasting is done here, it will be performed during
  3211. # concatenation itself.
  3212. values = self.block.get_values()
  3213. if not self.indexers:
  3214. # If there's no indexing to be done, we want to signal outside
  3215. # code that this array must be copied explicitly. This is done
  3216. # by returning a view and checking `retval.base`.
  3217. return values.view()
  3218. else:
  3219. for ax, indexer in self.indexers.items():
  3220. values = com.take_nd(values, indexer, axis=ax,
  3221. fill_value=fill_value)
  3222. return values
  3223. def _fast_count_smallints(arr):
  3224. """Faster version of set(arr) for sequences of small numbers."""
  3225. if len(arr) == 0:
  3226. # Handle empty arr case separately: numpy 1.6 chokes on that.
  3227. return np.empty((0, 2), dtype=arr.dtype)
  3228. else:
  3229. counts = np.bincount(arr.astype(np.int_))
  3230. nz = counts.nonzero()[0]
  3231. return np.c_[nz, counts[nz]]
  3232. def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill):
  3233. if isinstance(slice_or_indexer, slice):
  3234. return 'slice', slice_or_indexer, lib.slice_len(slice_or_indexer,
  3235. length)
  3236. elif (isinstance(slice_or_indexer, np.ndarray) and
  3237. slice_or_indexer.dtype == np.bool_):
  3238. return 'mask', slice_or_indexer, slice_or_indexer.sum()
  3239. else:
  3240. indexer = np.asanyarray(slice_or_indexer, dtype=np.int64)
  3241. if not allow_fill:
  3242. indexer = _maybe_convert_indices(indexer, length)
  3243. return 'fancy', indexer, len(indexer)