PageRenderTime 74ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/core/internals.py

http://github.com/pydata/pandas
Python | 4090 lines | 3440 code | 325 blank | 325 comment | 311 complexity | e9e7fceb6f578a654b22f12a998666af MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. import copy
  2. import itertools
  3. import re
  4. import operator
  5. from datetime import datetime, timedelta
  6. from collections import defaultdict
  7. import numpy as np
  8. from pandas.core.base import PandasObject
  9. from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
  10. _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
  11. ABCSparseSeries, _infer_dtype_from_scalar,
  12. _is_null_datelike_scalar,
  13. is_timedelta64_dtype, is_datetime64_dtype,
  14. _possibly_infer_to_datetimelike)
  15. from pandas.core.index import Index, MultiIndex, _ensure_index
  16. from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer)
  17. import pandas.core.common as com
  18. from pandas.sparse.array import _maybe_to_sparse, SparseArray
  19. import pandas.lib as lib
  20. import pandas.tslib as tslib
  21. import pandas.computation.expressions as expressions
  22. from pandas.util.decorators import cache_readonly
  23. from pandas.tslib import Timestamp
  24. from pandas import compat
  25. from pandas.compat import range, map, zip, u
  26. from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
  27. from pandas.lib import BlockPlacement
class Block(PandasObject):
    """
    Canonical n-dimensional unit of homogeneous dtype contained in a pandas
    data structure
    Index-ignorant; let the container take care of that
    """
    # names of the attributes this class stores per instance
    __slots__ = ['_mgr_locs', 'values', 'ndim']
    # kind flags: all off on the base class; subclasses in this file
    # (NumericBlock, FloatBlock, ComplexBlock, IntBlock) turn on their own
    is_numeric = False
    is_float = False
    is_integer = False
    is_complex = False
    is_datetime = False
    is_timedelta = False
    is_bool = False
    is_object = False
    is_sparse = False
    # checked by fillna() / _interpolate_with_fill() before filling
    _can_hold_na = False
    _downcast_dtype = None
    # first element of the tuple returned by _consolidate_key
    _can_consolidate = True
    _verify_integrity = True
    # suffix of the string returned by the ftype property
    _ftype = 'dense'
  49. def __init__(self, values, placement, ndim=None, fastpath=False):
  50. if ndim is None:
  51. ndim = values.ndim
  52. elif values.ndim != ndim:
  53. raise ValueError('Wrong number of dimensions')
  54. self.ndim = ndim
  55. self.mgr_locs = placement
  56. self.values = values
  57. if len(self.mgr_locs) != len(self.values):
  58. raise ValueError('Wrong number of items passed %d,'
  59. ' placement implies %d' % (
  60. len(self.values), len(self.mgr_locs)))
  61. @property
  62. def _consolidate_key(self):
  63. return (self._can_consolidate, self.dtype.name)
  64. @property
  65. def _is_single_block(self):
  66. return self.ndim == 1
  67. @property
  68. def is_datelike(self):
  69. """ return True if I am a non-datelike """
  70. return self.is_datetime or self.is_timedelta
  71. @property
  72. def fill_value(self):
  73. return np.nan
  74. @property
  75. def mgr_locs(self):
  76. return self._mgr_locs
  77. def make_block_same_class(self, values, placement, copy=False,
  78. **kwargs):
  79. """
  80. Wrap given values in a block of same type as self.
  81. `kwargs` are used in SparseBlock override.
  82. """
  83. if copy:
  84. values = values.copy()
  85. return make_block(values, placement, klass=self.__class__,
  86. fastpath=True)
  87. @mgr_locs.setter
  88. def mgr_locs(self, new_mgr_locs):
  89. if not isinstance(new_mgr_locs, BlockPlacement):
  90. new_mgr_locs = BlockPlacement(new_mgr_locs)
  91. self._mgr_locs = new_mgr_locs
  92. def __unicode__(self):
  93. # don't want to print out all of the items here
  94. name = com.pprint_thing(self.__class__.__name__)
  95. if self._is_single_block:
  96. result = '%s: %s dtype: %s' % (
  97. name, len(self), self.dtype)
  98. else:
  99. shape = ' x '.join([com.pprint_thing(s) for s in self.shape])
  100. result = '%s: %s, %s, dtype: %s' % (
  101. name, com.pprint_thing(self.mgr_locs.indexer), shape,
  102. self.dtype)
  103. return result
  104. def __len__(self):
  105. return len(self.values)
  106. def __getstate__(self):
  107. return self.mgr_locs.indexer, self.values
  108. def __setstate__(self, state):
  109. self.mgr_locs = BlockPlacement(state[0])
  110. self.values = state[1]
  111. self.ndim = self.values.ndim
  112. def _slice(self, slicer):
  113. """ return a slice of my values """
  114. return self.values[slicer]
  115. def getitem_block(self, slicer, new_mgr_locs=None):
  116. """
  117. Perform __getitem__-like, return result as block.
  118. As of now, only supports slices that preserve dimensionality.
  119. """
  120. if new_mgr_locs is None:
  121. if isinstance(slicer, tuple):
  122. axis0_slicer = slicer[0]
  123. else:
  124. axis0_slicer = slicer
  125. new_mgr_locs = self.mgr_locs[axis0_slicer]
  126. new_values = self._slice(slicer)
  127. if new_values.ndim != self.ndim:
  128. raise ValueError("Only same dim slicing is allowed")
  129. return self.make_block_same_class(new_values, new_mgr_locs)
  130. @property
  131. def shape(self):
  132. return self.values.shape
  133. @property
  134. def itemsize(self):
  135. return self.values.itemsize
  136. @property
  137. def dtype(self):
  138. return self.values.dtype
  139. @property
  140. def ftype(self):
  141. return "%s:%s" % (self.dtype, self._ftype)
  142. def merge(self, other):
  143. return _merge_blocks([self, other])
  144. def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
  145. limit=None, mask_info=None):
  146. """
  147. Reindex using pre-computed indexer information
  148. """
  149. if axis < 1:
  150. raise AssertionError('axis must be at least 1, got %d' % axis)
  151. if fill_value is None:
  152. fill_value = self.fill_value
  153. new_values = com.take_nd(self.values, indexer, axis,
  154. fill_value=fill_value, mask_info=mask_info)
  155. return make_block(new_values,
  156. ndim=self.ndim, fastpath=True,
  157. placement=self.mgr_locs)
  158. def get(self, item):
  159. loc = self.items.get_loc(item)
  160. return self.values[loc]
  161. def iget(self, i):
  162. return self.values[i]
  163. def set(self, locs, values, check=False):
  164. """
  165. Modify Block in-place with new item value
  166. Returns
  167. -------
  168. None
  169. """
  170. self.values[locs] = values
  171. def delete(self, loc):
  172. """
  173. Delete given loc(-s) from block in-place.
  174. """
  175. self.values = np.delete(self.values, loc, 0)
  176. self.mgr_locs = self.mgr_locs.delete(loc)
  177. def apply(self, func, **kwargs):
  178. """ apply the function to my values; return a block if we are not one """
  179. result = func(self.values)
  180. if not isinstance(result, Block):
  181. result = make_block(values=result, placement=self.mgr_locs,)
  182. return result
  183. def fillna(self, value, limit=None, inplace=False, downcast=None):
  184. if not self._can_hold_na:
  185. if inplace:
  186. return [self]
  187. else:
  188. return [self.copy()]
  189. mask = isnull(self.values)
  190. if limit is not None:
  191. if self.ndim > 2:
  192. raise NotImplementedError
  193. mask[mask.cumsum(self.ndim-1)>limit]=False
  194. value = self._try_fill(value)
  195. blocks = self.putmask(mask, value, inplace=inplace)
  196. return self._maybe_downcast(blocks, downcast)
  197. def _maybe_downcast(self, blocks, downcast=None):
  198. # no need to downcast our float
  199. # unless indicated
  200. if downcast is None and self.is_float:
  201. return blocks
  202. elif downcast is None and (self.is_timedelta or self.is_datetime):
  203. return blocks
  204. result_blocks = []
  205. for b in blocks:
  206. result_blocks.extend(b.downcast(downcast))
  207. return result_blocks
  208. def downcast(self, dtypes=None):
  209. """ try to downcast each item to the dict of dtypes if present """
  210. # turn it off completely
  211. if dtypes is False:
  212. return [self]
  213. values = self.values
  214. # single block handling
  215. if self._is_single_block:
  216. # try to cast all non-floats here
  217. if dtypes is None:
  218. dtypes = 'infer'
  219. nv = _possibly_downcast_to_dtype(values, dtypes)
  220. return [make_block(nv, ndim=self.ndim,
  221. fastpath=True, placement=self.mgr_locs)]
  222. # ndim > 1
  223. if dtypes is None:
  224. return [self]
  225. if not (dtypes == 'infer' or isinstance(dtypes, dict)):
  226. raise ValueError("downcast must have a dictionary or 'infer' as "
  227. "its argument")
  228. # item-by-item
  229. # this is expensive as it splits the blocks items-by-item
  230. blocks = []
  231. for i, rl in enumerate(self.mgr_locs):
  232. if dtypes == 'infer':
  233. dtype = 'infer'
  234. else:
  235. raise AssertionError("dtypes as dict is not supported yet")
  236. dtype = dtypes.get(item, self._downcast_dtype)
  237. if dtype is None:
  238. nv = _block_shape(values[i], ndim=self.ndim)
  239. else:
  240. nv = _possibly_downcast_to_dtype(values[i], dtype)
  241. nv = _block_shape(nv, ndim=self.ndim)
  242. blocks.append(make_block(nv,
  243. ndim=self.ndim, fastpath=True,
  244. placement=[rl]))
  245. return blocks
  246. def astype(self, dtype, copy=False, raise_on_error=True, values=None):
  247. return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
  248. values=values)
  249. def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
  250. klass=None):
  251. """
  252. Coerce to the new type (if copy=True, return a new copy)
  253. raise on an except if raise == True
  254. """
  255. dtype = np.dtype(dtype)
  256. if self.dtype == dtype:
  257. if copy:
  258. return self.copy()
  259. return self
  260. try:
  261. # force the copy here
  262. if values is None:
  263. # _astype_nansafe works fine with 1-d only
  264. values = com._astype_nansafe(self.values.ravel(), dtype, copy=True)
  265. values = values.reshape(self.values.shape)
  266. newb = make_block(values,
  267. ndim=self.ndim, placement=self.mgr_locs,
  268. fastpath=True, dtype=dtype, klass=klass)
  269. except:
  270. if raise_on_error is True:
  271. raise
  272. newb = self.copy() if copy else self
  273. if newb.is_numeric and self.is_numeric:
  274. if newb.shape != self.shape:
  275. raise TypeError("cannot set astype for copy = [%s] for dtype "
  276. "(%s [%s]) with smaller itemsize that current "
  277. "(%s [%s])" % (copy, self.dtype.name,
  278. self.itemsize, newb.dtype.name,
  279. newb.itemsize))
  280. return newb
  281. def convert(self, copy=True, **kwargs):
  282. """ attempt to coerce any object types to better types
  283. return a copy of the block (if copy = True)
  284. by definition we are not an ObjectBlock here! """
  285. return [self.copy()] if copy else [self]
  286. def _can_hold_element(self, value):
  287. raise NotImplementedError()
  288. def _try_cast(self, value):
  289. raise NotImplementedError()
    def _try_cast_result(self, result, dtype=None):
        """ try to cast the result to our original type,
        we may have roundtripped thru object in the mean-time

        ``dtype`` defaults to the block's own dtype.  Integer, bool and
        datetime blocks go straight to the final downcast.  A float block
        whose result kept the block dtype returns early from the branch
        below and never reaches the final downcast. """
        if dtype is None:
            dtype = self.dtype

        if self.is_integer or self.is_bool or self.is_datetime:
            pass
        elif self.is_float and result.dtype == self.dtype:

            # protect against a bool/object showing up here
            if isinstance(dtype, compat.string_types) and dtype == 'infer':
                return result
            # from here on work with the scalar type rather than a dtype
            if not isinstance(dtype, type):
                dtype = dtype.type
            if issubclass(dtype, (np.bool_, np.object_)):
                if issubclass(dtype, np.bool_):
                    if isnull(result).all():
                        return result.astype(np.bool_)
                    else:
                        # go through object so that 1/0 can be replaced by
                        # True/False while other entries stay as they are
                        result = result.astype(np.object_)
                        result[result == 1] = True
                        result[result == 0] = False
                        return result
                else:
                    return result.astype(np.object_)

            return result

        # may need to change the dtype here
        return _possibly_downcast_to_dtype(result, dtype)
  317. def _try_operate(self, values):
  318. """ return a version to operate on as the input """
  319. return values
  320. def _try_coerce_args(self, values, other):
  321. """ provide coercion to our input arguments """
  322. return values, other
  323. def _try_coerce_result(self, result):
  324. """ reverse of try_coerce_args """
  325. return result
  326. def _try_coerce_and_cast_result(self, result, dtype=None):
  327. result = self._try_coerce_result(result)
  328. result = self._try_cast_result(result, dtype=dtype)
  329. return result
  330. def _try_fill(self, value):
  331. return value
  332. def to_native_types(self, slicer=None, na_rep='', **kwargs):
  333. """ convert to our native types format, slicing if desired """
  334. values = self.values
  335. if slicer is not None:
  336. values = values[:, slicer]
  337. values = np.array(values, dtype=object)
  338. mask = isnull(values)
  339. values[mask] = na_rep
  340. return values.tolist()
  341. # block actions ####
  342. def copy(self, deep=True):
  343. values = self.values
  344. if deep:
  345. values = values.copy()
  346. return make_block(values, ndim=self.ndim,
  347. klass=self.__class__, fastpath=True,
  348. placement=self.mgr_locs)
  349. def replace(self, to_replace, value, inplace=False, filter=None,
  350. regex=False):
  351. """ replace the to_replace value with value, possible to create new
  352. blocks here this is just a call to putmask. regex is not used here.
  353. It is used in ObjectBlocks. It is here for API
  354. compatibility."""
  355. mask = com.mask_missing(self.values, to_replace)
  356. if filter is not None:
  357. filtered_out = ~self.mgr_locs.isin(filter)
  358. mask[filtered_out.nonzero()[0]] = False
  359. if not mask.any():
  360. if inplace:
  361. return [self]
  362. return [self.copy()]
  363. return self.putmask(mask, value, inplace=inplace)
    def setitem(self, indexer, value):
        """ set the value inplace; return a new block (of a possibly different
        dtype)
        indexer is a direct slice/positional indexer; value must be a
        compatible shape

        Returns a one-element list.  ValueError is raised when a list-like
        value does not match the length selected by the indexer.
        """
        # coerce args
        values, value = self._try_coerce_args(self.values, value)
        arr_value = np.array(value)

        # cast the values to a type that can hold nan (if necessary)
        if not self._can_hold_element(value):
            dtype, _ = com._maybe_promote(arr_value.dtype)
            values = values.astype(dtype)

        # 2-d values are transposed for the assignment and transposed back
        # when the result block is built
        transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x)
        values = transf(values)
        l = len(values)

        # length checking
        # boolean with truth values == len of the value is ok too
        if isinstance(indexer, (np.ndarray, list)):
            if is_list_like(value) and len(indexer) != len(value):
                if not (isinstance(indexer, np.ndarray) and
                        indexer.dtype == np.bool_ and
                        len(indexer[indexer]) == len(value)):
                    raise ValueError("cannot set using a list-like indexer "
                                     "with a different length than the value")

        # slice
        elif isinstance(indexer, slice):

            if is_list_like(value) and l:
                if len(value) != _length_of_indexer(indexer, values):
                    raise ValueError("cannot set using a slice indexer with a "
                                     "different length than the value")

        try:
            # setting a single element for each dim and with a rhs that could be say a list
            # GH 6043
            if arr_value.ndim == 1 and (
                np.isscalar(indexer) or (isinstance(indexer, tuple) and all([ np.isscalar(idx) for idx in indexer ]))):
                values[indexer] = value

            # if we are an exact match (ex-broadcasting),
            # then use the resultant dtype
            elif len(arr_value.shape) and arr_value.shape[0] == values.shape[0] and np.prod(arr_value.shape) == np.prod(values.shape):
                values[indexer] = value
                values = values.astype(arr_value.dtype)

            # set
            else:
                values[indexer] = value

            # coerce and try to infer the dtypes of the result
            if np.isscalar(value):
                dtype, _ = _infer_dtype_from_scalar(value)
            else:
                dtype = 'infer'
            values = self._try_coerce_and_cast_result(values, dtype)
            return [make_block(transf(values),
                               ndim=self.ndim, placement=self.mgr_locs,
                               fastpath=True)]
        except (ValueError, TypeError) as detail:
            raise
        except Exception as detail:
            # NOTE(review): any other failure is dropped silently and the
            # block is returned below -- confirm this best-effort is intended
            pass

        return [self]
    def putmask(self, mask, new, align=True, inplace=False):
        """ putmask the data to the block; it is possible that we may create a
        new dtype of block
        return the resulting block(s)

        Parameters
        ----------
        mask : the condition to respect
        new : a ndarray/object
        align : boolean, perform alignment on other/cond, default is True
            (not consulted in this implementation)
        inplace : perform inplace modification, default is False

        Returns
        -------
        a new block(s), the result of the putmask
        """
        new_values = self.values if inplace else self.values.copy()

        # may need to align the new
        if hasattr(new, 'reindex_axis'):
            new = new.values.T

        # may need to align the mask
        if hasattr(mask, 'reindex_axis'):
            mask = mask.values.T

        # if we are passed a scalar None, convert it here
        if not is_list_like(new) and isnull(new):
            new = self.fill_value

        if self._can_hold_element(new):
            new = self._try_cast(new)

            # pseudo-broadcast
            if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1:
                new = np.repeat(new, self.shape[-1]).reshape(self.shape)

            np.putmask(new_values, mask, new)

        # maybe upcast me
        elif mask.any():

            # need to go column by column
            new_blocks = []
            if self.ndim > 1:
                for i, ref_loc in enumerate(self.mgr_locs):
                    m = mask[i]
                    v = new_values[i]

                    # need a new block
                    if m.any():

                        n = new[i] if isinstance(
                            new, np.ndarray) else np.array(new)

                        # type of the new block
                        dtype, _ = com._maybe_promote(n.dtype)

                        # we need to explicitly astype here to make a copy
                        n = n.astype(dtype)

                        nv = _putmask_smart(v, m, n)
                    else:
                        nv = v if inplace else v.copy()

                    # Put back the dimension that was taken from it and make
                    # a block out of the result.
                    block = make_block(values=nv[np.newaxis],
                                       placement=[ref_loc],
                                       fastpath=True)
                    new_blocks.append(block)

            else:
                nv = _putmask_smart(new_values, mask, new)
                new_blocks.append(make_block(values=nv,
                                             placement=self.mgr_locs,
                                             fastpath=True))

            return new_blocks

        # reached when ``new`` fit the block, or when the mask selected nothing
        if inplace:
            return [self]

        return [make_block(new_values,
                           placement=self.mgr_locs, fastpath=True)]
  488. def interpolate(self, method='pad', axis=0, index=None,
  489. values=None, inplace=False, limit=None,
  490. fill_value=None, coerce=False, downcast=None, **kwargs):
  491. def check_int_bool(self, inplace):
  492. # Only FloatBlocks will contain NaNs.
  493. # timedelta subclasses IntBlock
  494. if (self.is_bool or self.is_integer) and not self.is_timedelta:
  495. if inplace:
  496. return self
  497. else:
  498. return self.copy()
  499. # a fill na type method
  500. try:
  501. m = com._clean_fill_method(method)
  502. except:
  503. m = None
  504. if m is not None:
  505. r = check_int_bool(self, inplace)
  506. if r is not None:
  507. return r
  508. return self._interpolate_with_fill(method=m,
  509. axis=axis,
  510. inplace=inplace,
  511. limit=limit,
  512. fill_value=fill_value,
  513. coerce=coerce,
  514. downcast=downcast)
  515. # try an interp method
  516. try:
  517. m = com._clean_interp_method(method, **kwargs)
  518. except:
  519. m = None
  520. if m is not None:
  521. r = check_int_bool(self, inplace)
  522. if r is not None:
  523. return r
  524. return self._interpolate(method=m,
  525. index=index,
  526. values=values,
  527. axis=axis,
  528. limit=limit,
  529. fill_value=fill_value,
  530. inplace=inplace,
  531. downcast=downcast,
  532. **kwargs)
  533. raise ValueError("invalid method '{0}' to interpolate.".format(method))
  534. def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
  535. limit=None, fill_value=None, coerce=False,
  536. downcast=None):
  537. """ fillna but using the interpolate machinery """
  538. # if we are coercing, then don't force the conversion
  539. # if the block can't hold the type
  540. if coerce:
  541. if not self._can_hold_na:
  542. if inplace:
  543. return [self]
  544. else:
  545. return [self.copy()]
  546. fill_value = self._try_fill(fill_value)
  547. values = self.values if inplace else self.values.copy()
  548. values = self._try_operate(values)
  549. values = com.interpolate_2d(values,
  550. method=method,
  551. axis=axis,
  552. limit=limit,
  553. fill_value=fill_value,
  554. dtype=self.dtype)
  555. values = self._try_coerce_result(values)
  556. blocks = [make_block(values,
  557. ndim=self.ndim, klass=self.__class__,
  558. fastpath=True, placement=self.mgr_locs)]
  559. return self._maybe_downcast(blocks, downcast)
    def _interpolate(self, method=None, index=None, values=None,
                     fill_value=None, axis=0, limit=None,
                     inplace=False, downcast=None, **kwargs):
        """ interpolate using scipy wrappers

        Each 1-d slice along ``axis`` is passed to com.interpolate_1d.
        Integer blocks are converted to float64 first; blocks that are
        neither float nor integer are returned unchanged.  ``values`` is
        accepted but not used here.
        """

        data = self.values if inplace else self.values.copy()

        # only deal with floats
        if not self.is_float:
            if not self.is_integer:
                # NOTE(review): a bare block is returned here, a list of
                # blocks at the end -- confirm callers cope with both
                return self
            data = data.astype(np.float64)

        if fill_value is None:
            fill_value = self.fill_value

        if method in ('krogh', 'piecewise_polynomial', 'pchip'):
            # NOTE(review): ``index`` defaults to None, which would fail on
            # the attribute access below -- presumably callers always pass it
            if not index.is_monotonic:
                raise ValueError("{0} interpolation requires that the "
                                 "index be monotonic.".format(method))
        # process 1-d slices in the axis direction

        def func(x):
            # process a 1-d slice, returning it
            # should the axis argument be handled below in apply_along_axis?
            # i.e. not an arg to com.interpolate_1d
            return com.interpolate_1d(index, x, method=method, limit=limit,
                                      fill_value=fill_value,
                                      bounds_error=False, **kwargs)

        # interp each column independently
        interp_values = np.apply_along_axis(func, axis, data)

        blocks = [make_block(interp_values,
                             ndim=self.ndim, klass=self.__class__,
                             fastpath=True, placement=self.mgr_locs)]
        return self._maybe_downcast(blocks, downcast)
  590. def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
  591. """
  592. Take values according to indexer and return them as a block.bb
  593. """
  594. if fill_tuple is None:
  595. fill_value = self.fill_value
  596. new_values = com.take_nd(self.get_values(), indexer, axis=axis,
  597. allow_fill=False)
  598. else:
  599. fill_value = fill_tuple[0]
  600. new_values = com.take_nd(self.get_values(), indexer, axis=axis,
  601. allow_fill=True, fill_value=fill_value)
  602. if new_mgr_locs is None:
  603. if axis == 0:
  604. slc = lib.indexer_as_slice(indexer)
  605. if slc is not None:
  606. new_mgr_locs = self.mgr_locs[slc]
  607. else:
  608. new_mgr_locs = self.mgr_locs[indexer]
  609. else:
  610. new_mgr_locs = self.mgr_locs
  611. if new_values.dtype != self.dtype:
  612. return make_block(new_values, new_mgr_locs)
  613. else:
  614. return self.make_block_same_class(new_values, new_mgr_locs)
  615. def get_values(self, dtype=None):
  616. return self.values
  617. def diff(self, n):
  618. """ return block for the diff of the values """
  619. new_values = com.diff(self.values, n, axis=1)
  620. return [make_block(values=new_values,
  621. ndim=self.ndim, fastpath=True,
  622. placement=self.mgr_locs)]
  623. def shift(self, periods, axis=0):
  624. """ shift the block by periods, possibly upcast """
  625. # convert integer to float if necessary. need to do a lot more than
  626. # that, handle boolean etc also
  627. new_values, fill_value = com._maybe_upcast(self.values)
  628. # make sure array sent to np.roll is c_contiguous
  629. f_ordered = new_values.flags.f_contiguous
  630. if f_ordered:
  631. new_values = new_values.T
  632. axis = new_values.ndim - axis - 1
  633. new_values = np.roll(new_values, periods, axis=axis)
  634. axis_indexer = [ slice(None) ] * self.ndim
  635. if periods > 0:
  636. axis_indexer[axis] = slice(None,periods)
  637. else:
  638. axis_indexer[axis] = slice(periods,None)
  639. new_values[tuple(axis_indexer)] = fill_value
  640. # restore original order
  641. if f_ordered:
  642. new_values = new_values.T
  643. return [make_block(new_values,
  644. ndim=self.ndim, fastpath=True,
  645. placement=self.mgr_locs)]
  646. def eval(self, func, other, raise_on_error=True, try_cast=False):
  647. """
  648. evaluate the block; return result block from the result
  649. Parameters
  650. ----------
  651. func : how to combine self, other
  652. other : a ndarray/object
  653. raise_on_error : if True, raise when I can't perform the function,
  654. False by default (and just return the data that we had coming in)
  655. Returns
  656. -------
  657. a new block, the result of the func
  658. """
  659. values = self.values
  660. if hasattr(other, 'reindex_axis'):
  661. other = other.values
  662. # make sure that we can broadcast
  663. is_transposed = False
  664. if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
  665. if values.ndim != other.ndim:
  666. is_transposed = True
  667. else:
  668. if values.shape == other.shape[::-1]:
  669. is_transposed = True
  670. elif values.shape[0] == other.shape[-1]:
  671. is_transposed = True
  672. else:
  673. # this is a broadcast error heree
  674. raise ValueError("cannot broadcast shape [%s] with block "
  675. "values [%s]" % (values.T.shape,
  676. other.shape))
  677. transf = (lambda x: x.T) if is_transposed else (lambda x: x)
  678. # coerce/transpose the args if needed
  679. values, other = self._try_coerce_args(transf(values), other)
  680. # get the result, may need to transpose the other
  681. def get_result(other):
  682. return self._try_coerce_result(func(values, other))
  683. # error handler if we have an issue operating with the function
  684. def handle_error():
  685. if raise_on_error:
  686. raise TypeError('Could not operate %s with block values %s'
  687. % (repr(other), str(detail)))
  688. else:
  689. # return the values
  690. result = np.empty(values.shape, dtype='O')
  691. result.fill(np.nan)
  692. return result
  693. # get the result
  694. try:
  695. result = get_result(other)
  696. # if we have an invalid shape/broadcast error
  697. # GH4576, so raise instead of allowing to pass through
  698. except ValueError as detail:
  699. raise
  700. except Exception as detail:
  701. result = handle_error()
  702. # technically a broadcast error in numpy can 'work' by returning a
  703. # boolean False
  704. if not isinstance(result, np.ndarray):
  705. if not isinstance(result, np.ndarray):
  706. # differentiate between an invalid ndarray-ndarray comparison
  707. # and an invalid type comparison
  708. if isinstance(values, np.ndarray) and is_list_like(other):
  709. raise ValueError('Invalid broadcasting comparison [%s] '
  710. 'with block values' % repr(other))
  711. raise TypeError('Could not compare [%s] with block values'
  712. % repr(other))
  713. # transpose if needed
  714. result = transf(result)
  715. # try to cast if requested
  716. if try_cast:
  717. result = self._try_cast_result(result)
  718. return [make_block(result, ndim=self.ndim,
  719. fastpath=True, placement=self.mgr_locs)]
    def where(self, other, cond, align=True, raise_on_error=True,
              try_cast=False):
        """
        evaluate the block; return result block(s) from the result

        Parameters
        ----------
        other : a ndarray/object
        cond : the condition to respect
        align : boolean, perform alignment on other/cond
            (not consulted in this implementation)
        raise_on_error : if True, raise when I can't perform the function,
            True by default; if False a float64 array of NaN is used
        try_cast : if True, pass the result through _try_cast_result
            (only looked at on the single-result path), False by default

        Returns
        -------
        a new block(s), the result of the func.  A single block (not a
        list) when the block can hold NA or is 1-dimensional, otherwise a
        list of blocks.
        """

        values = self.values

        # see if we can align other
        if hasattr(other, 'reindex_axis'):
            other = other.values

        # make sure that we can broadcast
        is_transposed = False
        if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
            if values.ndim != other.ndim or values.shape == other.shape[::-1]:

                # if its symmetric are ok, no reshaping needed (GH 7506)
                if (values.shape[0] == np.array(values.shape)).all():
                    pass

                # pseudo broadcast (its a 2d vs 1d say and where needs it in a
                # specific direction)
                elif (other.ndim >= 1 and values.ndim - 1 == other.ndim and
                        values.shape[0] != other.shape[0]):
                    other = _block_shape(other).T
                else:
                    values = values.T
                    is_transposed = True

        # see if we can align cond
        if not hasattr(cond, 'shape'):
            raise ValueError(
                "where must have a condition that is ndarray like")

        if hasattr(cond, 'reindex_axis'):
            cond = cond.values

        # may need to undo transpose of values
        if hasattr(values, 'ndim'):
            if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:

                values = values.T
                is_transposed = not is_transposed

        # our where function
        def func(c, v, o):
            # nothing to replace when the condition holds everywhere
            if c.ravel().all():
                return v

            v, o = self._try_coerce_args(v, o)
            try:
                return self._try_coerce_result(
                    expressions.where(c, v, o, raise_on_error=True)
                )
            except Exception as detail:
                if raise_on_error:
                    raise TypeError('Could not operate [%s] with block values '
                                    '[%s]' % (repr(o), str(detail)))
                else:
                    # return the values
                    result = np.empty(v.shape, dtype='float64')
                    result.fill(np.nan)
                    return result

        # see if we can operate on the entire block, or need item-by-item
        # or if we are a single block (ndim == 1)
        result = func(cond, values, other)
        if self._can_hold_na or self.ndim == 1:

            if not isinstance(result, np.ndarray):
                raise TypeError('Could not compare [%s] with block values'
                                % repr(other))

            if is_transposed:
                result = result.T

            # try to cast if requested
            if try_cast:
                result = self._try_cast_result(result)

            return make_block(result,
                              ndim=self.ndim, placement=self.mgr_locs)

        # might need to separate out blocks
        axis = cond.ndim - 1
        cond = cond.swapaxes(axis, 0)
        # one flag per entry along the last axis of cond: does the
        # condition hold everywhere for it?
        mask = np.array([cond[i].all() for i in range(cond.shape[0])],
                        dtype=bool)

        result_blocks = []
        # first the entries where the condition held everywhere, then the rest
        for m in [mask, ~mask]:
            if m.any():
                r = self._try_cast_result(
                    result.take(m.nonzero()[0], axis=axis))
                result_blocks.append(make_block(r.T,
                                                placement=self.mgr_locs[m]))

        return result_blocks
  810. def equals(self, other):
  811. if self.dtype != other.dtype or self.shape != other.shape: return False
  812. return np.array_equal(self.values, other.values)
class NumericBlock(Block):
    # Base for the numeric block classes below; adds no per-instance
    # attributes of its own.
    __slots__ = ()
    is_numeric = True
    # overrides the Block default of False (IntBlock switches it off again)
    _can_hold_na = True
  817. class FloatOrComplexBlock(NumericBlock):
  818. __slots__ = ()
  819. def equals(self, other):
  820. if self.dtype != other.dtype or self.shape != other.shape: return False
  821. left, right = self.values, other.values
  822. return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
  823. class FloatBlock(FloatOrComplexBlock):
  824. __slots__ = ()
  825. is_float = True
  826. _downcast_dtype = 'int64'
  827. def _can_hold_element(self, element):
  828. if is_list_like(element):
  829. element = np.array(element)
  830. tipo = element.dtype.type
  831. return issubclass(tipo, (np.floating, np.integer)) and not issubclass(
  832. tipo, (np.datetime64, np.timedelta64))
  833. return isinstance(element, (float, int, np.float_, np.int_)) and not isinstance(
  834. element, (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64))
  835. def _try_cast(self, element):
  836. try:
  837. return float(element)
  838. except: # pragma: no cover
  839. return element
  840. def to_native_types(self, slicer=None, na_rep='', float_format=None,
  841. **kwargs):
  842. """ convert to our native types format, slicing if desired """
  843. values = self.values
  844. if slicer is not None:
  845. values = values[:, slicer]
  846. values = np.array(values, dtype=object)
  847. mask = isnull(values)
  848. values[mask] = na_rep
  849. if float_format:
  850. imask = (~mask).ravel()
  851. values.flat[imask] = np.array(
  852. [float_format % val for val in values.ravel()[imask]])
  853. return values.tolist()
  854. def should_store(self, value):
  855. # when inserting a column should not coerce integers to floats
  856. # unnecessarily
  857. return (issubclass(value.dtype.type, np.floating) and
  858. value.dtype == self.dtype)
  859. class ComplexBlock(FloatOrComplexBlock):
  860. __slots__ = ()
  861. is_complex = True
  862. def _can_hold_element(self, element):
  863. if is_list_like(element):
  864. element = np.array(element)
  865. return issubclass(element.dtype.type, (np.floating, np.integer, np.complexfloating))
  866. return (isinstance(element, (float, int, complex, np.float_, np.int_)) and
  867. not isinstance(bool, np.bool_))
  868. def _try_cast(self, element):
  869. try:
  870. return complex(element)
  871. except: # pragma: no cover
  872. return element
  873. def should_store(self, value):
  874. return issubclass(value.dtype.type, np.complexfloating)
  875. class IntBlock(NumericBlock):
  876. __slots__ = ()
  877. is_integer = True
  878. _can_hold_na = False
  879. def _can_hold_element(self, element):
  880. if is_list_like(element):
  881. element = np.array(element)
  882. tipo = element.dtype.type
  883. return issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))
  884. return com.is_integer(element)
  885. def _try_cast(self, element):
  886. try:
  887. return int(element)
  888. except: # pragma: no cover
  889. return element
  890. def should_store(self, value):
  891. return com.is_integer_dtype(value) and value.dtype == self.dtype
  892. class TimeDeltaBlock(IntBlock):
  893. __slots__ = ()
  894. is_timedelta = True
  895. _can_hold_na = True
  896. is_numeric = False
  897. @property
  898. def fill_value(self):
  899. return tslib.iNaT
  900. def _try_fill(self, value):
  901. """ if we are a NaT, return the actual fill value """
  902. if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all():
  903. value = tslib.iNaT
  904. elif isinstance(value, np.timedelta64):
  905. pass
  906. elif com.is_integer(value):
  907. # coerce to seconds of timedelta
  908. value = np.timedelta64(int(value * 1e9))
  909. elif isinstance(value, timedelta):
  910. value = np.timedelta64(value)
  911. return value
  912. def _try_coerce_args(self, values, other):
  913. """ provide coercion to our input arguments
  914. we are going to compare vs i8, so coerce to floats
  915. repring NaT with np.nan so nans propagate
  916. values is always ndarray like, other may not be """
  917. def masker(v):
  918. mask = isnull(v)
  919. v = v.view('i8').astype('float64')
  920. v[mask] = np.nan
  921. return v
  922. values = masker(values)
  923. if _is_null_datelike_scalar(other):
  924. other = np.nan
  925. elif isinstance(other, np.timedelta64):
  926. other = _coerce_scalar_to_timedelta_type(other, unit='s').item()
  927. if other == tslib.iNaT:
  928. other = np.nan
  929. else:
  930. other = masker(other)
  931. return values, other
  932. def _try_operate(self, values):
  933. """ return a version to operate on """
  934. return values.view('i8')
  935. def _try_coerce_result(self, result):
  936. """ reverse of try_coerce_args / try_operate """
  937. if isinstance(result, np.ndarray):
  938. mask = isnull(result)
  939. if result.dtype.kind in ['i', 'f', 'O']:
  940. result = result.astype('m8[ns]')
  941. result[mask] = tslib.iNaT
  942. elif isinstance(result, np.integer):
  943. result = np.timedelta64(result)
  944. return result
  945. def should_store(self, value):
  946. return issubclass(value.dtype.type, np.timedelta64)
  947. def to_native_types(self, slicer=None, na_rep=None, **kwargs):
  948. """ convert to our native types format, slicing if desired """
  949. values = self.values
  950. if slicer is not None:
  951. values = values[:, slicer]
  952. mask = isnull(values)
  953. rvalues = np.empty(values.shape, dtype=object)
  954. if na_rep is None:
  955. na_rep = 'NaT'
  956. rvalues[mask] = na_rep
  957. imask = (~mask).ravel()
  958. rvalues.flat[imask] = np.array([lib.repr_timedelta64(val)
  959. for val in values.ravel()[imask]],
  960. dtype=object)
  961. return rvalues.tolist()
  962. class BoolBlock(NumericBlock):
  963. __slots__ = ()
  964. is_bool = True
  965. _can_hold_na = False
  966. def _can_hold_element(self, element):
  967. if is_list_like(element):
  968. element = np.array(element)
  969. return issubclass(element.dtype.type, np.integer)
  970. return isinstance(element, (int, bool))
  971. def _try_cast(self, element):
  972. try:
  973. return bool(element)
  974. except: # pragma: no cover
  975. return element
  976. def should_store(self, value):
  977. return issubclass(value.dtype.type, np.bool_)
  978. def replace(self, to_replace, value, inplace=False, filter=None,
  979. regex=False):
  980. to_replace_values = np.atleast_1d(to_replace)
  981. if not np.can_cast(to_replace_values, bool):
  982. return self
  983. return super(BoolBlock, self).replace(to_replace, value,
  984. inplace=inplace, filter=filter,
  985. regex=regex)
  986. class ObjectBlock(Block):
  987. __slots__ = ()
  988. is_object = True
  989. _can_hold_na = True
  990. def __init__(self, values, ndim=2, fastpath=False,
  991. placement=None):
  992. if issubclass(values.dtype.type, compat.string_types):
  993. values = np.array(values, dtype=object)
  994. super(ObjectBlock, self).__init__(values, ndim=ndim,
  995. fastpath=fastpath,
  996. placement=placement)
  997. @property
  998. def is_bool(self):
  999. """ we can be a bool if we have only bool values but are of type
  1000. object
  1001. """
  1002. return lib.is_bool_array(self.values.ravel())
  1003. def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=True,
  1004. copy=True, by_item=True):
  1005. """ attempt to coerce any object types to better types
  1006. return a copy of the block (if copy = True)
  1007. by definition we ARE an ObjectBlock!!!!!
  1008. can return multiple blocks!
  1009. """
  1010. # attempt to create new type blocks
  1011. blocks = []
  1012. if by_item and not self._is_single_block:
  1013. for i, rl in enumerate(self.mgr_locs):
  1014. values = self.iget(i)
  1015. values = com._possibly_convert_objects(
  1016. values.ravel(), convert_dates=convert_dates,
  1017. convert_numeric=convert_numeric,
  1018. convert_timedeltas=convert_timedeltas,
  1019. ).reshape(values.shape)
  1020. values = _block_shape(values, ndim=self.ndim)
  1021. newb = make_block(values,
  1022. ndim=self.ndim, placement=[rl])
  1023. blocks.append(newb)
  1024. else:
  1025. values = com._possibly_convert_objects(
  1026. self.values.ravel(), convert_dates=convert_dates,
  1027. convert_numeric=convert_numeric
  1028. ).reshape(self.values.shape)
  1029. blocks.append(make_block(values,
  1030. ndim=self.ndim, placement=self.mgr_locs))
  1031. return blocks
  1032. def set(self, locs, values, check=False):
  1033. """
  1034. Modify Block in-place with new item value
  1035. Returns
  1036. -------
  1037. None
  1038. """
  1039. # GH6026
  1040. if check:
  1041. try:
  1042. if (self.values[locs] == values).all():
  1043. return
  1044. except:
  1045. pass
  1046. try:
  1047. self.values[locs] = values
  1048. except (ValueError):
  1049. # broadcasting error
  1050. # see GH6171
  1051. new_shape = list(values.shape)
  1052. new_shape[0] = len(self.items)
  1053. self.values = np.empty(tuple(new_shape),dtype=self.dtype)
  1054. self.values.fill(np.nan)
  1055. self.values[locs] = values
  1056. def _maybe_downcast(self, blocks, downcast=None):
  1057. if downcast is not None:
  1058. return blocks
  1059. # split and convert the blocks
  1060. result_blocks = []
  1061. for blk in blocks:
  1062. result_blocks.extend(blk.convert(convert_dates=True,
  1063. convert_numeric=False))
  1064. return result_blocks
  1065. def _can_hold_element(self, element):
  1066. return True
  1067. def _try_cast(self, element):
  1068. return element
  1069. def should_store(self, value):
  1070. return not issubclass(value.dtype.type,
  1071. (np.integer, np.floating, np.complexfloating,
  1072. np.datetime64, np.bool_))
  1073. def replace(self, to_replace, value, inplace=False, filter=None,
  1074. regex=False):
  1075. blk = [self]
  1076. to_rep_is_list = com.is_list_like(to_replace)
  1077. value_is_list = com.is_list_like(value)
  1078. both_lists = to_rep_is_list and value_is_list
  1079. either_list = to_rep_is_list or value_is_list
  1080. if not either_list and com.is_re(to_replace):
  1081. blk[0], = blk[0]._replace_single(to_replace, value,
  1082. inplace=inplace, filter=filter,
  1083. regex=True)
  1084. elif not (either_list or regex):
  1085. blk = super(ObjectBlock, self).replace(to_replace, value,
  1086. inplace=inplace,
  1087. filter=filter, regex=regex)
  1088. elif both_lists:
  1089. for to_rep, v in zip(to_replace, value):
  1090. blk[0], = blk[0]._replace_single(to_rep, v, inplace=inplace,
  1091. filter=filter, regex=regex)
  1092. elif to_rep_is_list and regex:
  1093. for to_rep in to_replace:
  1094. blk[0], = blk[0]._replace_single(to_rep, value,
  1095. inplace=inplace,
  1096. filter=filter, regex=regex)
  1097. else:
  1098. blk[0], = blk[0]._replace_single(to_replace, value,
  1099. inplace=inplace, filter=filter,
  1100. regex=regex)
  1101. return blk
  1102. def _replace_single(self, to_replace, value, inplace=False, filter=None,
  1103. regex=False):
  1104. # to_replace is regex compilable
  1105. to_rep_re = regex and com.is_re_compilable(to_replace)
  1106. # regex is regex compilable
  1107. regex_re = com.is_re_compilable(regex)
  1108. # only one will survive
  1109. if to_rep_re and regex_re:
  1110. raise AssertionError('only one of to_replace and regex can be '
  1111. 'regex compilable')
  1112. # if regex was passed as something that can be a regex (rather than a
  1113. # boolean)
  1114. if regex_re:
  1115. to_replace = regex
  1116. regex = regex_re or to_rep_re
  1117. # try to get the pattern attribute (compiled re) or it's a string
  1118. try:
  1119. pattern = to_replace.pattern
  1120. except AttributeError:
  1121. pattern = to_replace
  1122. # if the pattern is not empty and to_replace is either a string or a
  1123. # regex
  1124. if regex and pattern:
  1125. rx = re.compile(to_replace)
  1126. else:
  1127. # if the thing to replace is not a string or compiled regex call
  1128. # the superclass method -> to_replace is some kind of object
  1129. result = super(ObjectBlock, self).replace(to_replace, value,
  1130. inplace=inplace,
  1131. filter=filter,
  1132. regex=regex)
  1133. if not isinstance(result, list):
  1134. result = [result]
  1135. return result
  1136. new_values = self.values if inplace else self.values.copy()
  1137. # deal with replacing values with objects (strings) that match but
  1138. # whose replacement is not a string (numeric, nan, object)
  1139. if isnull(value) or not isinstance(value, compat.string_types):
  1140. def re_replacer(s):
  1141. try:
  1142. return value if rx.search(s) is not None else s
  1143. except TypeError:
  1144. return s
  1145. else:
  1146. # value is guaranteed to be a string here, s can be either a string
  1147. # or null if it's null it gets returned
  1148. def re_replacer(s):
  1149. try:
  1150. return rx.sub(value, s)
  1151. except TypeError:
  1152. return s
  1153. f = np.vectorize(re_replacer, otypes=[self.dtype])
  1154. if filter is None:
  1155. filt = slice(None)
  1156. else:
  1157. filt = self.mgr_locs.isin(filter).nonzero()[0]
  1158. new_values[filt] = f(new_values[filt])
  1159. return [self if inplace else
  1160. make_block(new_values,
  1161. fastpath=True, placement=self.mgr_locs)]
  1162. class DatetimeBlock(Block):
  1163. __slots__ = ()
  1164. is_datetime = True
  1165. _can_hold_na = True
  1166. def __init__(self, values, placement,
  1167. fastpath=False, **kwargs):
  1168. if values.dtype != _NS_DTYPE:
  1169. values = tslib.cast_to_nanoseconds(values)
  1170. super(DatetimeBlock, self).__init__(values,
  1171. fastpath=True, placement=placement,
  1172. **kwargs)
  1173. def _can_hold_element(self, element):
  1174. if is_list_like(element):
  1175. element = np.array(element)
  1176. return element.dtype == _NS_DTYPE or element.dtype == np.int64
  1177. return (com.is_integer(element) or
  1178. isinstance(element, datetime) or
  1179. isnull(element))
  1180. def _try_cast(self, element):
  1181. try:
  1182. return int(element)
  1183. except:
  1184. return element
  1185. def _try_operate(self, values):
  1186. """ return a version to operate on """
  1187. return values.view('i8')
  1188. def _try_coerce_args(self, values, other):
  1189. """ provide coercion to our input arguments
  1190. we are going to compare vs i8, so coerce to integer
  1191. values is always ndarra like, other may not be """
  1192. values = values.view('i8')
  1193. if _is_null_datelike_scalar(other):
  1194. other = tslib.iNaT
  1195. elif isinstance(other, datetime):
  1196. other = lib.Timestamp(other).asm8.view('i8')
  1197. else:
  1198. other = other.view('i8')
  1199. return values, other
  1200. def _try_coerce_result(self, result):
  1201. """ reverse of try_coerce_args """
  1202. if isinstance(result, np.ndarray):
  1203. if result.dtype == 'i8':
  1204. result = tslib.array_to_datetime(
  1205. result.astype(object).ravel()).reshape(result.shape)
  1206. elif result.dtype.kind in ['i', 'f', 'O']:
  1207. result = result.astype('M8[ns]')
  1208. elif isinstance(result, (np.integer, np.datetime64)):
  1209. result = lib.Timestamp(result)
  1210. return result
  1211. @property
  1212. def fill_value(self):
  1213. return tslib.iNaT
  1214. def _try_fill(self, value):
  1215. """ if we are a NaT, return the actual fill value """
  1216. if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all():
  1217. value = tslib.iNaT
  1218. return value
  1219. def fillna(self, value, limit=None,
  1220. inplace=False, downcast=None):
  1221. # straight putmask here
  1222. values = self.values if inplace else self.values.copy()
  1223. mask = isnull(self.values)
  1224. value = self._try_fill(value)
  1225. if limit is not None:
  1226. if self.ndim > 2:
  1227. raise NotImplementedError
  1228. mask[mask.cumsum(self.ndim-1)>limit]=False
  1229. np.putmask(values, mask, value)
  1230. return [self if inplace else
  1231. make_block(values,
  1232. fastpath=True, placement=self.mgr_locs)]
  1233. def to_native_types(self, slicer=None, na_rep=None, date_format=None,
  1234. **kwargs):
  1235. """ convert to our native types format, slicing if desired """
  1236. values = self.values
  1237. if slicer is not None:
  1238. values = values[:, slicer]
  1239. mask = isnull(values)
  1240. rvalues = np.empty(values.shape, dtype=object)
  1241. if na_rep is None:
  1242. na_rep = 'NaT'
  1243. rvalues[mask] = na_rep
  1244. imask = (~mask).ravel()
  1245. if date_format is None:
  1246. date_formatter = lambda x: Timestamp(x)._repr_base
  1247. else:
  1248. date_formatter = lambda x: Timestamp(x).strftime(date_format)
  1249. rvalues.flat[imask] = np.array([date_formatter(val) for val in
  1250. values.ravel()[imask]], dtype=object)
  1251. return rvalues.tolist()
  1252. def should_store(self, value):
  1253. return issubclass(value.dtype.type, np.datetime64)
  1254. def astype(self, dtype, copy=False, raise_on_error=True):
  1255. """
  1256. handle convert to object as a special case
  1257. """
  1258. klass = None
  1259. if np.dtype(dtype).type == np.object_:
  1260. klass = ObjectBlock
  1261. return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
  1262. klass=klass)
  1263. def set(self, locs, values, check=False):
  1264. """
  1265. Modify Block in-place with new item value
  1266. Returns
  1267. -------
  1268. None
  1269. """
  1270. if values.dtype != _NS_DTYPE:
  1271. # Workaround for numpy 1.6 bug
  1272. values = tslib.cast_to_nanoseconds(values)
  1273. self.values[locs] = values
  1274. def get_values(self, dtype=None):
  1275. # return object dtype as Timestamps
  1276. if dtype == object:
  1277. return lib.map_infer(self.values.ravel(), lib.Timestamp)\
  1278. .reshape(self.values.shape)
  1279. return self.values
  1280. class SparseBlock(Block):
  1281. """ implement as a list of sparse arrays of the same dtype """
  1282. __slots__ = ()
  1283. is_sparse = True
  1284. is_numeric = True
  1285. _can_hold_na = True
  1286. _can_consolidate = False
  1287. _verify_integrity = False
  1288. _ftype = 'sparse'
  1289. def __init__(self, values, placement,
  1290. ndim=None, fastpath=False,):
  1291. # Placement must be converted to BlockPlacement via property setter
  1292. # before ndim logic, because placement may be a slice which doesn't
  1293. # have a length.
  1294. self.mgr_locs = placement
  1295. # kludgetastic
  1296. if ndim is None:
  1297. if len(self.mgr_locs) != 1:
  1298. ndim = 1
  1299. else:
  1300. ndim = 2
  1301. self.ndim = ndim
  1302. if not isinstance(values, SparseArray):
  1303. raise TypeError("values must be SparseArray")
  1304. self.values = values
  1305. @property
  1306. def shape(self):
  1307. return (len(self.mgr_locs), self.sp_index.length)
  1308. @property
  1309. def itemsize(self):
  1310. return self.dtype.itemsize
  1311. @property
  1312. def fill_value(self):
  1313. #return np.nan
  1314. return self.values.fill_value
  1315. @fill_value.setter
  1316. def fill_value(self, v):
  1317. # we may need to upcast our fill to match our dtype
  1318. if issubclass(self.dtype.type, np.floating):
  1319. v = float(v)
  1320. self.values.fill_value = v
  1321. @property
  1322. def sp_values(self):
  1323. return self.values.sp_values
  1324. @sp_values.setter
  1325. def sp_values(self, v):
  1326. # reset the sparse values
  1327. self.values = SparseArray(v, sparse_index=self.sp_index,
  1328. kind=self.kind, dtype=v.dtype,
  1329. fill_value=self.values.fill_value,
  1330. copy=False)
  1331. def iget(self, col):
  1332. if col != 0:
  1333. raise IndexError("SparseBlock only contains one item")
  1334. return self.values
  1335. @property
  1336. def sp_index(self):
  1337. return self.values.sp_index
  1338. @property
  1339. def kind(self):
  1340. return self.values.kind
  1341. def __len__(self):
  1342. try:
  1343. return self.sp_index.length
  1344. except:
  1345. return 0
  1346. def should_store(self, value):
  1347. return isinstance(value, SparseArray)
  1348. def set(self, locs, values, check=False):
  1349. assert locs.tolist() == [0]
  1350. self.values = values
  1351. def get(self, item):
  1352. if self.ndim == 1:
  1353. loc = self.items.get_loc(item)
  1354. return self.values[loc]
  1355. else:
  1356. return self.values
  1357. def _slice(self, slicer):
  1358. """ return a slice of my values (but densify first) """
  1359. return self.get_values()[slicer]
  1360. def get_values(self, dtype=None):
  1361. """ need to to_dense myself (and always return a ndim sized object) """
  1362. values = self.values.to_dense()
  1363. if values.ndim == self.ndim - 1:
  1364. values = values.reshape((1,) + values.shape)
  1365. return values
  1366. def copy(self, deep=True):
  1367. return self.make_block_same_class(values=self.values,
  1368. sparse_index=self.sp_index,
  1369. kind=self.kind, copy=deep,
  1370. placement=self.mgr_locs)
  1371. def make_block_same_class(self, values, placement,
  1372. sparse_index=None, kind=None, dtype=None,
  1373. fill_value=None, copy=False, fastpath=True):
  1374. """ return a new block """
  1375. if dtype is None:
  1376. dtype = self.dtype
  1377. if fill_value is None:
  1378. fill_value = self.values.fill_value
  1379. # if not isinstance(values, SparseArray) and values.ndim != self.ndim:
  1380. # raise ValueError("ndim mismatch")
  1381. if values.ndim == 2:
  1382. nitems = values.shape[0]
  1383. if nitems == 0:
  1384. # kludgy, but SparseBlocks cannot handle slices, where the
  1385. # output is 0-item, so let's convert it to a dense block: it
  1386. # won't take space since there's 0 items, plus it will preserve
  1387. # the dtype.
  1388. return make_block(np.empty(values.shape, dtype=dtype),
  1389. placement, fastpath=True,)
  1390. elif nitems > 1:
  1391. raise ValueError("Only 1-item 2d sparse blocks are supported")
  1392. else:
  1393. values = values.reshape(values.shape[1])
  1394. new_values = SparseArray(values, sparse_index=sparse_index,
  1395. kind=kind or self.kind, dtype=dtype,
  1396. fill_value=fill_value, copy=copy)
  1397. return make_block(new_values, ndim=self.ndim,
  1398. fastpath=fastpath, placement=placement)
  1399. def interpolate(self, method='pad', axis=0, inplace=False,
  1400. limit=None, fill_value=None, **kwargs):
  1401. values = com.interpolate_2d(
  1402. self.values.to_dense(), method, axis, limit, fill_value)
  1403. return self.make_block_same_class(values=values,
  1404. placement=self.mgr_locs)
  1405. def fillna(self, value, limit=None, inplace=False, downcast=None):
  1406. # we may need to upcast our fill to match our dtype
  1407. if limit is not None:
  1408. raise NotImplementedError
  1409. if issubclass(self.dtype.type, np.floating):
  1410. value = float(value)
  1411. values = self.values if inplace else self.values.copy()
  1412. return [self.make_block_same_class(values=values.get_values(value),
  1413. fill_value=value,
  1414. placement=self.mgr_locs)]
  1415. def shift(self, periods, axis=0):
  1416. """ shift the block by periods """
  1417. N = len(self.values.T)
  1418. indexer = np.zeros(N, dtype=int)
  1419. if periods > 0:
  1420. indexer[periods:] = np.arange(N - periods)
  1421. else:
  1422. indexer[:periods] = np.arange(-periods, N)
  1423. new_values = self.values.to_dense().take(indexer)
  1424. # convert integer to float if necessary. need to do a lot more than
  1425. # that, handle boolean etc also
  1426. new_values, fill_value = com._maybe_upcast(new_values)
  1427. if periods > 0:
  1428. new_values[:periods] = fill_value
  1429. else:
  1430. new_values[periods:] = fill_value
  1431. return [self.make_block_same_class(new_values, placement=self.mgr_locs)]
  1432. def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
  1433. limit=None, mask_info=None):
  1434. """
  1435. Reindex using pre-computed indexer information
  1436. """
  1437. if axis < 1:
  1438. raise AssertionError('axis must be at least 1, got %d' % axis)
  1439. # taking on the 0th axis always here
  1440. if fill_value is None:
  1441. fill_value = self.fill_value
  1442. return self.make_block_same_class(self.values.take(indexer),
  1443. fill_value=fill_value,
  1444. placement=self.mgr_locs)
  1445. def sparse_reindex(self, new_index):
  1446. """ sparse reindex and return a new block
  1447. current reindex only works for float64 dtype! """
  1448. values = self.values
  1449. values = values.sp_index.to_int_index().reindex(
  1450. values.sp_values.astype('float64'), values.fill_value, new_index)
  1451. return self.make_block_same_class(values, sparse_index=new_index,
  1452. placement=self.mgr_locs)
  1453. def _try_cast_result(self, result, dtype=None):
  1454. return result
  1455. def make_block(values, placement, klass=None, ndim=None,
  1456. dtype=None, fastpath=False):
  1457. if klass is None:
  1458. dtype = dtype or values.dtype
  1459. vtype = dtype.type
  1460. if isinstance(values, SparseArray):
  1461. klass = SparseBlock
  1462. elif issubclass(vtype, np.floating):
  1463. klass = FloatBlock
  1464. elif (issubclass(vtype, np.integer) and
  1465. issubclass(vtype, np.timedelta64)):
  1466. klass = TimeDeltaBlock
  1467. elif (issubclass(vtype, np.integer) and
  1468. not issubclass(vtype, np.datetime64)):
  1469. klass = IntBlock
  1470. elif dtype == np.bool_:
  1471. klass = BoolBlock
  1472. elif issubclass(vtype, np.datetime64):
  1473. klass = DatetimeBlock
  1474. elif issubclass(vtype, np.complexfloating):
  1475. klass = ComplexBlock
  1476. else:
  1477. # we want to infer here if its a datetimelike if its object type
  1478. # this is pretty strict in that it requires a datetime/timedelta
  1479. # value IN addition to possible nulls/strings
  1480. # an array of ONLY strings will not be inferred
  1481. if np.prod(values.shape):
  1482. result = _possibly_infer_to_datetimelike(values)
  1483. vtype = result.dtype.type
  1484. if issubclass(vtype, np.datetime64):
  1485. klass = DatetimeBlock
  1486. values = result
  1487. elif (issubclass(vtype, np.timedelta64)):
  1488. klass = TimeDeltaBlock
  1489. values = result
  1490. if klass is None:
  1491. klass = ObjectBlock
  1492. return klass(values, ndim=ndim, fastpath=fastpath,
  1493. placement=placement)
  1494. # TODO: flexible with index=None and/or items=None
  1495. class BlockManager(PandasObject):
  1496. """
  1497. Core internal data structure to implement DataFrame
  1498. Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
  1499. lightweight blocked set of labeled data to be manipulated by the DataFrame
  1500. public API class
  1501. Attributes
  1502. ----------
  1503. shape
  1504. ndim
  1505. axes
  1506. values
  1507. items
  1508. Methods
  1509. -------
  1510. set_axis(axis, new_labels)
  1511. copy(deep=True)
  1512. get_dtype_counts
  1513. get_ftype_counts
  1514. get_dtypes
  1515. get_ftypes
  1516. apply(func, axes, block_filter_fn)
  1517. get_bool_data
  1518. get_numeric_data
  1519. get_slice(slice_like, axis)
  1520. get(label)
  1521. iget(loc)
  1522. get_scalar(label_tup)
  1523. take(indexer, axis)
  1524. reindex_axis(new_labels, axis)
  1525. reindex_indexer(new_labels, indexer, axis)
  1526. delete(label)
  1527. insert(loc, label, value)
  1528. set(label, value)
  1529. Parameters
  1530. ----------
  1531. Notes
  1532. -----
  1533. This is *not* a public API class
  1534. """
  1535. __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated',
  1536. '_is_consolidated', '_blknos', '_blklocs']
  1537. def __init__(self, blocks, axes, do_integrity_check=True, fastpath=True):
  1538. self.axes = [_ensure_index(ax) for ax in axes]
  1539. self.blocks = tuple(blocks)
  1540. for block in blocks:
  1541. if block.is_sparse:
  1542. if len(block.mgr_locs) != 1:
  1543. raise AssertionError("Sparse block refers to multiple items")
  1544. else:
  1545. if self.ndim != block.ndim:
  1546. raise AssertionError(('Number of Block dimensions (%d) must '
  1547. 'equal number of axes (%d)')
  1548. % (block.ndim, self.ndim))
  1549. if do_integrity_check:
  1550. self._verify_integrity()
  1551. self._consolidate_check()
  1552. self._rebuild_blknos_and_blklocs()
  1553. def make_empty(self, axes=None):
  1554. """ return an empty BlockManager with the items axis of len 0 """
  1555. if axes is None:
  1556. axes = [_ensure_index([])] + [
  1557. _ensure_index(a) for a in self.axes[1:]
  1558. ]
  1559. # preserve dtype if possible
  1560. if self.ndim == 1:
  1561. blocks = np.array([], dtype=self.dtype)
  1562. else:
  1563. blocks = []
  1564. return self.__class__(blocks, axes)
  1565. def __nonzero__(self):
  1566. return True
  1567. # Python3 compat
  1568. __bool__ = __nonzero__
  1569. @property
  1570. def shape(self):
  1571. return tuple(len(ax) for ax in self.axes)
  1572. @property
  1573. def ndim(self):
  1574. return len(self.axes)
  1575. def set_axis(self, axis, new_labels):
  1576. new_labels = _ensure_index(new_labels)
  1577. old_len = len(self.axes[axis])
  1578. new_len = len(new_labels)
  1579. if new_len != old_len:
  1580. raise ValueError('Length mismatch: Expected axis has %d elements, '
  1581. 'new values have %d elements' % (old_len, new_len))
  1582. self.axes[axis] = new_labels
  1583. def rename_axis(self, mapper, axis, copy=True):
  1584. """
  1585. Rename one of axes.
  1586. Parameters
  1587. ----------
  1588. mapper : unary callable
  1589. axis : int
  1590. copy : boolean, default True
  1591. """
  1592. obj = self.copy(deep=copy)
  1593. obj.set_axis(axis, _transform_index(self.axes[axis], mapper))
  1594. return obj
  1595. def add_prefix(self, prefix):
  1596. f = (str(prefix) + '%s').__mod__
  1597. return self.rename_axis(f, axis=0)
  1598. def add_suffix(self, suffix):
  1599. f = ('%s' + str(suffix)).__mod__
  1600. return self.rename_axis(f, axis=0)
  1601. @property
  1602. def _is_single_block(self):
  1603. if self.ndim == 1:
  1604. return True
  1605. if len(self.blocks) != 1:
  1606. return False
  1607. blk = self.blocks[0]
  1608. return (blk.mgr_locs.is_slice_like and
  1609. blk.mgr_locs.as_slice == slice(0, len(self), 1))
  1610. def _rebuild_blknos_and_blklocs(self):
  1611. """
  1612. Update mgr._blknos / mgr._blklocs.
  1613. """
  1614. new_blknos = np.empty(self.shape[0], dtype=np.int64)
  1615. new_blklocs = np.empty(self.shape[0], dtype=np.int64)
  1616. new_blknos.fill(-1)
  1617. new_blklocs.fill(-1)
  1618. for blkno, blk in enumerate(self.blocks):
  1619. rl = blk.mgr_locs
  1620. new_blknos[rl.indexer] = blkno
  1621. new_blklocs[rl.indexer] = np.arange(len(rl))
  1622. if (new_blknos == -1).any():
  1623. raise AssertionError("Gaps in blk ref_locs")
  1624. self._blknos = new_blknos
  1625. self._blklocs = new_blklocs
  1626. # make items read only for now
  1627. def _get_items(self):
  1628. return self.axes[0]
  1629. items = property(fget=_get_items)
  1630. def _get_counts(self, f):
  1631. """ return a dict of the counts of the function in BlockManager """
  1632. self._consolidate_inplace()
  1633. counts = dict()
  1634. for b in self.blocks:
  1635. v = f(b)
  1636. counts[v] = counts.get(v, 0) + b.shape[0]
  1637. return counts
  1638. def get_dtype_counts(self):
  1639. return self._get_counts(lambda b: b.dtype.name)
  1640. def get_ftype_counts(self):
  1641. return self._get_counts(lambda b: b.ftype)
  1642. def get_dtypes(self):
  1643. dtypes = np.array([blk.dtype for blk in self.blocks])
  1644. return com.take_1d(dtypes, self._blknos, allow_fill=False)
  1645. def get_ftypes(self):
  1646. ftypes = np.array([blk.ftype for blk in self.blocks])
  1647. return com.take_1d(ftypes, self._blknos, allow_fill=False)
  1648. def __getstate__(self):
  1649. block_values = [b.values for b in self.blocks]
  1650. block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
  1651. axes_array = [ax for ax in self.axes]
  1652. extra_state = {
  1653. '0.14.1': {
  1654. 'axes': axes_array,
  1655. 'blocks': [dict(values=b.values,
  1656. mgr_locs=b.mgr_locs.indexer)
  1657. for b in self.blocks]
  1658. }
  1659. }
  1660. # First three elements of the state are to maintain forward
  1661. # compatibility with 0.13.1.
  1662. return axes_array, block_values, block_items, extra_state
  1663. def __setstate__(self, state):
  1664. def unpickle_block(values, mgr_locs):
  1665. # numpy < 1.7 pickle compat
  1666. if values.dtype == 'M8[us]':
  1667. values = values.astype('M8[ns]')
  1668. return make_block(values, placement=mgr_locs)
  1669. if (isinstance(state, tuple) and len(state) >= 4
  1670. and '0.14.1' in state[3]):
  1671. state = state[3]['0.14.1']
  1672. self.axes = [_ensure_index(ax) for ax in state['axes']]
  1673. self.blocks = tuple(
  1674. unpickle_block(b['values'], b['mgr_locs'])
  1675. for b in state['blocks'])
  1676. else:
  1677. # discard anything after 3rd, support beta pickling format for a
  1678. # little while longer
  1679. ax_arrays, bvalues, bitems = state[:3]
  1680. self.axes = [_ensure_index(ax) for ax in ax_arrays]
  1681. self.blocks = tuple(
  1682. unpickle_block(values,
  1683. self.axes[0].get_indexer(items))
  1684. for values, items in zip(bvalues, bitems))
  1685. self._post_setstate()
  1686. def _post_setstate(self):
  1687. self._is_consolidated = False
  1688. self._known_consolidated = False
  1689. self._rebuild_blknos_and_blklocs()
  1690. def __len__(self):
  1691. return len(self.items)
  1692. def __unicode__(self):
  1693. output = com.pprint_thing(self.__class__.__name__)
  1694. for i, ax in enumerate(self.axes):
  1695. if i == 0:
  1696. output += u('\nItems: %s') % ax
  1697. else:
  1698. output += u('\nAxis %d: %s') % (i, ax)
  1699. for block in self.blocks:
  1700. output += u('\n%s') % com.pprint_thing(block)
  1701. return output
  1702. def _verify_integrity(self):
  1703. mgr_shape = self.shape
  1704. tot_items = sum(len(x.mgr_locs) for x in self.blocks)
  1705. for block in self.blocks:
  1706. if not block.is_sparse and block.shape[1:] != mgr_shape[1:]:
  1707. construction_error(tot_items, block.shape[1:], self.axes)
  1708. if len(self.items) != tot_items:
  1709. raise AssertionError('Number of manager items must equal union of '
  1710. 'block items\n# manager items: {0}, # '
  1711. 'tot_items: {1}'.format(len(self.items),
  1712. tot_items))
  1713. def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs):
  1714. """
  1715. iterate over the blocks, collect and create a new block manager
  1716. Parameters
  1717. ----------
  1718. f : the callable or function name to operate on at the block level
  1719. axes : optional (if not supplied, use self.axes)
  1720. filter : list, if supplied, only call the block if the filter is in
  1721. the block
  1722. do_integrity_check : boolean, default False. Do the block manager integrity check
  1723. Returns
  1724. -------
  1725. Block Manager (new object)
  1726. """
  1727. result_blocks = []
  1728. # filter kwarg is used in replace-* family of methods
  1729. if filter is not None:
  1730. filter_locs = set(self.items.get_indexer_for(filter))
  1731. if len(filter_locs) == len(self.items):
  1732. # All items are included, as if there were no filtering
  1733. filter = None
  1734. else:
  1735. kwargs['filter'] = filter_locs
  1736. if f == 'where' and kwargs.get('align', True):
  1737. align_copy = True
  1738. align_keys = ['other', 'cond']
  1739. elif f == 'putmask' and kwargs.get('align', True):
  1740. align_copy = False
  1741. align_keys = ['new', 'mask']
  1742. elif f == 'eval':
  1743. align_copy = False
  1744. align_keys = ['other']
  1745. elif f == 'fillna':
  1746. # fillna internally does putmask, maybe it's better to do this
  1747. # at mgr, not block level?
  1748. align_copy = False
  1749. align_keys = ['value']
  1750. else:
  1751. align_keys = []
  1752. aligned_args = dict((k, kwargs[k]) for k in align_keys
  1753. if hasattr(kwargs[k], 'reindex_axis'))
  1754. for b in self.blocks:
  1755. if filter is not None:
  1756. if not b.mgr_locs.isin(filter_locs).any():
  1757. result_blocks.append(b)
  1758. continue
  1759. if aligned_args:
  1760. b_items = self.items[b.mgr_locs.indexer]
  1761. for k, obj in aligned_args.items():
  1762. axis = getattr(obj, '_info_axis_number', 0)
  1763. kwargs[k] = obj.reindex_axis(b_items, axis=axis,
  1764. copy=align_copy)
  1765. applied = getattr(b, f)(**kwargs)
  1766. if isinstance(applied, list):
  1767. result_blocks.extend(applied)
  1768. else:
  1769. result_blocks.append(applied)
  1770. if len(result_blocks) == 0:
  1771. return self.make_empty(axes or self.axes)
  1772. bm = self.__class__(result_blocks, axes or self.axes,
  1773. do_integrity_check=do_integrity_check)
  1774. bm._consolidate_inplace()
  1775. return bm
  1776. def isnull(self, **kwargs):
  1777. return self.apply('apply', **kwargs)
  1778. def where(self, **kwargs):
  1779. return self.apply('where', **kwargs)
  1780. def eval(self, **kwargs):
  1781. return self.apply('eval', **kwargs)
  1782. def setitem(self, **kwargs):
  1783. return self.apply('setitem', **kwargs)
  1784. def putmask(self, **kwargs):
  1785. return self.apply('putmask', **kwargs)
  1786. def diff(self, **kwargs):
  1787. return self.apply('diff', **kwargs)
  1788. def interpolate(self, **kwargs):
  1789. return self.apply('interpolate', **kwargs)
  1790. def shift(self, **kwargs):
  1791. return self.apply('shift', **kwargs)
  1792. def fillna(self, **kwargs):
  1793. return self.apply('fillna', **kwargs)
  1794. def downcast(self, **kwargs):
  1795. return self.apply('downcast', **kwargs)
  1796. def astype(self, dtype, **kwargs):
  1797. return self.apply('astype', dtype=dtype, **kwargs)
  1798. def convert(self, **kwargs):
  1799. return self.apply('convert', **kwargs)
  1800. def replace(self, **kwargs):
  1801. return self.apply('replace', **kwargs)
  1802. def replace_list(self, src_list, dest_list, inplace=False, regex=False):
  1803. """ do a list replace """
  1804. # figure out our mask a-priori to avoid repeated replacements
  1805. values = self.as_matrix()
  1806. def comp(s):
  1807. if isnull(s):
  1808. return isnull(values)
  1809. return _possibly_compare(values, getattr(s, 'asm8', s),
  1810. operator.eq)
  1811. masks = [comp(s) for i, s in enumerate(src_list)]
  1812. result_blocks = []
  1813. for blk in self.blocks:
  1814. # its possible to get multiple result blocks here
  1815. # replace ALWAYS will return a list
  1816. rb = [blk if inplace else blk.copy()]
  1817. for i, (s, d) in enumerate(zip(src_list, dest_list)):
  1818. new_rb = []
  1819. for b in rb:
  1820. if b.dtype == np.object_:
  1821. result = b.replace(s, d, inplace=inplace,
  1822. regex=regex)
  1823. if isinstance(result, list):
  1824. new_rb.extend(result)
  1825. else:
  1826. new_rb.append(result)
  1827. else:
  1828. # get our mask for this element, sized to this
  1829. # particular block
  1830. m = masks[i][b.mgr_locs.indexer]
  1831. if m.any():
  1832. new_rb.extend(b.putmask(m, d, inplace=True))
  1833. else:
  1834. new_rb.append(b)
  1835. rb = new_rb
  1836. result_blocks.extend(rb)
  1837. bm = self.__class__(result_blocks, self.axes)
  1838. bm._consolidate_inplace()
  1839. return bm
  1840. def is_consolidated(self):
  1841. """
  1842. Return True if more than one block with the same dtype
  1843. """
  1844. if not self._known_consolidated:
  1845. self._consolidate_check()
  1846. return self._is_consolidated
  1847. def _consolidate_check(self):
  1848. ftypes = [blk.ftype for blk in self.blocks]
  1849. self._is_consolidated = len(ftypes) == len(set(ftypes))
  1850. self._known_consolidated = True
  1851. @property
  1852. def is_mixed_type(self):
  1853. # Warning, consolidation needs to get checked upstairs
  1854. self._consolidate_inplace()
  1855. return len(self.blocks) > 1
  1856. @property
  1857. def is_numeric_mixed_type(self):
  1858. # Warning, consolidation needs to get checked upstairs
  1859. self._consolidate_inplace()
  1860. return all([block.is_numeric for block in self.blocks])
  1861. @property
  1862. def is_datelike_mixed_type(self):
  1863. # Warning, consolidation needs to get checked upstairs
  1864. self._consolidate_inplace()
  1865. return any([block.is_datelike for block in self.blocks])
  1866. @property
  1867. def is_view(self):
  1868. """ return a boolean if we are a single block and are a view """
  1869. if len(self.blocks) == 1:
  1870. return self.blocks[0].values.base is not None
  1871. return False
  1872. def get_bool_data(self, copy=False):
  1873. """
  1874. Parameters
  1875. ----------
  1876. copy : boolean, default False
  1877. Whether to copy the blocks
  1878. """
  1879. self._consolidate_inplace()
  1880. return self.combine([b for b in self.blocks if b.is_bool], copy)
  1881. def get_numeric_data(self, copy=False):
  1882. """
  1883. Parameters
  1884. ----------
  1885. copy : boolean, default False
  1886. Whether to copy the blocks
  1887. """
  1888. self._consolidate_inplace()
  1889. return self.combine([b for b in self.blocks if b.is_numeric], copy)
  1890. def combine(self, blocks, copy=True):
  1891. """ return a new manager with the blocks """
  1892. if len(blocks) == 0:
  1893. return self.make_empty()
  1894. # FIXME: optimization potential
  1895. indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
  1896. inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
  1897. new_items = self.items.take(indexer)
  1898. new_blocks = []
  1899. for b in blocks:
  1900. b = b.copy(deep=copy)
  1901. b.mgr_locs = com.take_1d(inv_indexer, b.mgr_locs.as_array, axis=0,
  1902. allow_fill=False)
  1903. new_blocks.append(b)
  1904. new_axes = list(self.axes)
  1905. new_axes[0] = new_items
  1906. return self.__class__(new_blocks, new_axes, do_integrity_check=False)
  1907. def get_slice(self, slobj, axis=0):
  1908. if axis >= self.ndim:
  1909. raise IndexError("Requested axis not found in manager")
  1910. if axis == 0:
  1911. new_blocks = self._slice_take_blocks_ax0(slobj)
  1912. else:
  1913. slicer = [slice(None)] * (axis + 1)
  1914. slicer[axis] = slobj
  1915. slicer = tuple(slicer)
  1916. new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]
  1917. new_axes = list(self.axes)
  1918. new_axes[axis] = new_axes[axis][slobj]
  1919. bm = self.__class__(new_blocks, new_axes, do_integrity_check=False,
  1920. fastpath=True)
  1921. bm._consolidate_inplace()
  1922. return bm
  1923. def __contains__(self, item):
  1924. return item in self.items
  1925. @property
  1926. def nblocks(self):
  1927. return len(self.blocks)
  1928. def copy(self, deep=True):
  1929. """
  1930. Make deep or shallow copy of BlockManager
  1931. Parameters
  1932. ----------
  1933. deep : boolean, default True
  1934. If False, return shallow copy (do not copy data)
  1935. Returns
  1936. -------
  1937. copy : BlockManager
  1938. """
  1939. if deep:
  1940. new_axes = [ax.view() for ax in self.axes]
  1941. else:
  1942. new_axes = list(self.axes)
  1943. return self.apply('copy', axes=new_axes, deep=deep,
  1944. do_integrity_check=False)
  1945. def as_matrix(self, items=None):
  1946. if len(self.blocks) == 0:
  1947. return np.empty(self.shape, dtype=float)
  1948. if items is not None:
  1949. mgr = self.reindex_axis(items, axis=0)
  1950. else:
  1951. mgr = self
  1952. if self._is_single_block:
  1953. return mgr.blocks[0].get_values()
  1954. else:
  1955. return mgr._interleave()
  1956. def _interleave(self):
  1957. """
  1958. Return ndarray from blocks with specified item order
  1959. Items must be contained in the blocks
  1960. """
  1961. dtype = _interleaved_dtype(self.blocks)
  1962. result = np.empty(self.shape, dtype=dtype)
  1963. if result.shape[0] == 0:
  1964. # Workaround for numpy 1.7 bug:
  1965. #
  1966. # >>> a = np.empty((0,10))
  1967. # >>> a[slice(0,0)]
  1968. # array([], shape=(0, 10), dtype=float64)
  1969. # >>> a[[]]
  1970. # Traceback (most recent call last):
  1971. # File "<stdin>", line 1, in <module>
  1972. # IndexError: index 0 is out of bounds for axis 0 with size 0
  1973. return result
  1974. itemmask = np.zeros(self.shape[0])
  1975. for blk in self.blocks:
  1976. rl = blk.mgr_locs
  1977. result[rl.indexer] = blk.get_values(dtype)
  1978. itemmask[rl.indexer] = 1
  1979. if not itemmask.all():
  1980. raise AssertionError('Some items were not contained in blocks')
  1981. return result
  1982. def xs(self, key, axis=1, copy=True, takeable=False):
  1983. if axis < 1:
  1984. raise AssertionError('Can only take xs across axis >= 1, got %d'
  1985. % axis)
  1986. # take by position
  1987. if takeable:
  1988. loc = key
  1989. else:
  1990. loc = self.axes[axis].get_loc(key)
  1991. slicer = [slice(None, None) for _ in range(self.ndim)]
  1992. slicer[axis] = loc
  1993. slicer = tuple(slicer)
  1994. new_axes = list(self.axes)
  1995. # could be an array indexer!
  1996. if isinstance(loc, (slice, np.ndarray)):
  1997. new_axes[axis] = new_axes[axis][loc]
  1998. else:
  1999. new_axes.pop(axis)
  2000. new_blocks = []
  2001. if len(self.blocks) > 1:
  2002. # we must copy here as we are mixed type
  2003. for blk in self.blocks:
  2004. newb = make_block(values=blk.values[slicer],
  2005. klass=blk.__class__, fastpath=True,
  2006. placement=blk.mgr_locs)
  2007. new_blocks.append(newb)
  2008. elif len(self.blocks) == 1:
  2009. block = self.blocks[0]
  2010. vals = block.values[slicer]
  2011. if copy:
  2012. vals = vals.copy()
  2013. new_blocks = [make_block(values=vals, placement=block.mgr_locs,
  2014. klass=block.__class__, fastpath=True,)]
  2015. return self.__class__(new_blocks, new_axes)
  2016. def fast_xs(self, loc):
  2017. """
  2018. get a cross sectional for a given location in the
  2019. items ; handle dups
  2020. return the result, is *could* be a view in the case of a
  2021. single block
  2022. """
  2023. if len(self.blocks) == 1:
  2024. return self.blocks[0].values[:, loc]
  2025. items = self.items
  2026. # non-unique (GH4726)
  2027. if not items.is_unique:
  2028. result = self._interleave()
  2029. if self.ndim == 2:
  2030. result = result.T
  2031. return result[loc]
  2032. # unique
  2033. dtype = _interleaved_dtype(self.blocks)
  2034. n = len(items)
  2035. result = np.empty(n, dtype=dtype)
  2036. for blk in self.blocks:
  2037. # Such assignment may incorrectly coerce NaT to None
  2038. # result[blk.mgr_locs] = blk._slice((slice(None), loc))
  2039. for i, rl in enumerate(blk.mgr_locs):
  2040. result[rl] = blk._try_coerce_result(blk.iget((i, loc)))
  2041. return result
  2042. def consolidate(self):
  2043. """
  2044. Join together blocks having same dtype
  2045. Returns
  2046. -------
  2047. y : BlockManager
  2048. """
  2049. if self.is_consolidated():
  2050. return self
  2051. bm = self.__class__(self.blocks, self.axes)
  2052. bm._consolidate_inplace()
  2053. return bm
  2054. def _consolidate_inplace(self):
  2055. if not self.is_consolidated():
  2056. self.blocks = tuple(_consolidate(self.blocks))
  2057. self._is_consolidated = True
  2058. self._known_consolidated = True
  2059. self._rebuild_blknos_and_blklocs()
  2060. def get(self, item, fastpath=True):
  2061. """
  2062. Return values for selected item (ndarray or BlockManager).
  2063. """
  2064. if self.items.is_unique:
  2065. if not isnull(item):
  2066. loc = self.items.get_loc(item)
  2067. else:
  2068. indexer = np.arange(len(self.items))[isnull(self.items)]
  2069. # allow a single nan location indexer
  2070. if not np.isscalar(indexer):
  2071. if len(indexer) == 1:
  2072. loc = indexer.item()
  2073. else:
  2074. raise ValueError("cannot label index with a null key")
  2075. return self.iget(loc, fastpath=fastpath)
  2076. else:
  2077. if isnull(item):
  2078. raise ValueError("cannot label index with a null key")
  2079. indexer = self.items.get_indexer_for([item])
  2080. return self.reindex_indexer(new_axis=self.items[indexer],
  2081. indexer=indexer, axis=0, allow_dups=True)
  2082. def iget(self, i, fastpath=True):
  2083. """
  2084. Return the data as a SingleBlockManager if fastpath=True and possible
  2085. Otherwise return as a ndarray
  2086. """
  2087. block = self.blocks[self._blknos[i]]
  2088. values = block.iget(self._blklocs[i])
  2089. if not fastpath or block.is_sparse or values.ndim != 1:
  2090. return values
  2091. # fastpath shortcut for select a single-dim from a 2-dim BM
  2092. return SingleBlockManager([ block.make_block_same_class(values,
  2093. placement=slice(0, len(values)),
  2094. fastpath=True) ],
  2095. self.axes[1])
  2096. def get_scalar(self, tup):
  2097. """
  2098. Retrieve single item
  2099. """
  2100. full_loc = list(ax.get_loc(x)
  2101. for ax, x in zip(self.axes, tup))
  2102. blk = self.blocks[self._blknos[full_loc[0]]]
  2103. full_loc[0] = self._blklocs[full_loc[0]]
  2104. # FIXME: this may return non-upcasted types?
  2105. return blk.values[tuple(full_loc)]
def delete(self, item):
    """
    Delete selected item (items if non-unique) in-place.

    The item is located on the items axis, the affected blocks are shrunk
    or dropped, the manager locations of the remaining blocks are shifted
    down, and the items axis, cached shape and blknos/blklocs lookups are
    refreshed.
    """
    indexer = self.items.get_loc(item)

    # boolean mask over the items marking what gets removed
    is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
    is_deleted[indexer] = True
    # for every old position: minus the number of deletions up to and
    # including it, i.e. the shift to apply to surviving locations
    ref_loc_offset = -is_deleted.cumsum()

    is_blk_deleted = [False] * len(self.blocks)

    # only blocks owning items at or after the first deleted position
    # can be affected
    if isinstance(indexer, int):
        affected_start = indexer
    else:
        affected_start = is_deleted.nonzero()[0][0]

    # NOTE(review): _fast_count_smallints presumably yields
    # (block number, count) pairs for the given blknos -- confirm.
    for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]):
        blk = self.blocks[blkno]
        bml = blk.mgr_locs
        # positions inside this block that are being deleted
        blk_del = is_deleted[bml.indexer].nonzero()[0]

        if len(blk_del) == len(bml):
            # every item of the block goes away: drop the whole block
            is_blk_deleted[blkno] = True
            continue
        elif len(blk_del) != 0:
            blk.delete(blk_del)
            # re-read the locations after the block-level delete
            bml = blk.mgr_locs

        blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer])

    # FIXME: use Index.delete as soon as it uses fastpath=True
    self.axes[0] = self.items[~is_deleted]
    self.blocks = tuple(b for blkno, b in enumerate(self.blocks)
                        if not is_blk_deleted[blkno])
    # reset the cached shape before rebuilding the lookups
    self._shape = None
    self._rebuild_blknos_and_blklocs()
  2136. def set(self, item, value, check=False):
  2137. """
  2138. Set new item in-place. Does not consolidate. Adds new Block if not
  2139. contained in the current set of items
  2140. if check, then validate that we are not setting the same data in-place
  2141. """
  2142. # FIXME: refactor, clearly separate broadcasting & zip-like assignment
  2143. value_is_sparse = isinstance(value, SparseArray)
  2144. if value_is_sparse:
  2145. assert self.ndim == 2
  2146. def value_getitem(placement):
  2147. return value
  2148. else:
  2149. if value.ndim == self.ndim - 1:
  2150. value = value.reshape((1,) + value.shape)
  2151. def value_getitem(placement):
  2152. return value
  2153. else:
  2154. def value_getitem(placement):
  2155. return value[placement.indexer]
  2156. if value.shape[1:] != self.shape[1:]:
  2157. raise AssertionError('Shape of new values must be compatible '
  2158. 'with manager shape')
  2159. try:
  2160. loc = self.items.get_loc(item)
  2161. except KeyError:
  2162. # This item wasn't present, just insert at end
  2163. self.insert(len(self.items), item, value)
  2164. return
  2165. if isinstance(loc, int):
  2166. loc = [loc]
  2167. blknos = self._blknos[loc]
  2168. blklocs = self._blklocs[loc]
  2169. unfit_mgr_locs = []
  2170. unfit_val_locs = []
  2171. removed_blknos = []
  2172. for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks),
  2173. group=True):
  2174. blk = self.blocks[blkno]
  2175. blk_locs = blklocs[val_locs.indexer]
  2176. if blk.should_store(value):
  2177. blk.set(blk_locs, value_getitem(val_locs), check=check)
  2178. else:
  2179. unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
  2180. unfit_val_locs.append(val_locs)
  2181. # If all block items are unfit, schedule the block for removal.
  2182. if len(val_locs) == len(blk.mgr_locs):
  2183. removed_blknos.append(blkno)
  2184. else:
  2185. self._blklocs[blk.mgr_locs.indexer] = -1
  2186. blk.delete(blk_locs)
  2187. self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))
  2188. if len(removed_blknos):
  2189. # Remove blocks & update blknos accordingly
  2190. is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
  2191. is_deleted[removed_blknos] = True
  2192. new_blknos = np.empty(self.nblocks, dtype=np.int64)
  2193. new_blknos.fill(-1)
  2194. new_blknos[~is_deleted] = np.arange(self.nblocks -
  2195. len(removed_blknos))
  2196. self._blknos = com.take_1d(new_blknos, self._blknos, axis=0,
  2197. allow_fill=False)
  2198. self.blocks = tuple(blk for i, blk in enumerate(self.blocks)
  2199. if i not in set(removed_blknos))
  2200. if unfit_val_locs:
  2201. unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
  2202. unfit_count = len(unfit_mgr_locs)
  2203. new_blocks = []
  2204. if value_is_sparse:
  2205. # This code (ab-)uses the fact that sparse blocks contain only
  2206. # one item.
  2207. new_blocks.extend(
  2208. make_block(values=value.copy(), ndim=self.ndim,
  2209. placement=slice(mgr_loc, mgr_loc + 1))
  2210. for mgr_loc in unfit_mgr_locs)
  2211. self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) +
  2212. len(self.blocks))
  2213. self._blklocs[unfit_mgr_locs] = 0
  2214. else:
  2215. # unfit_val_locs contains BlockPlacement objects
  2216. unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
  2217. new_blocks.append(
  2218. make_block(values=value_getitem(unfit_val_items),
  2219. ndim=self.ndim, placement=unfit_mgr_locs))
  2220. self._blknos[unfit_mgr_locs] = len(self.blocks)
  2221. self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)
  2222. self.blocks += tuple(new_blocks)
  2223. # Newly created block's dtype may already be present.
  2224. self._known_consolidated = False
  2225. def insert(self, loc, item, value, allow_duplicates=False):
  2226. """
  2227. Insert item at selected position.
  2228. Parameters
  2229. ----------
  2230. loc : int
  2231. item : hashable
  2232. value : array_like
  2233. allow_duplicates: bool
  2234. If False, trying to insert non-unique item will raise
  2235. """
  2236. if not allow_duplicates and item in self.items:
  2237. # Should this be a different kind of error??
  2238. raise ValueError('cannot insert %s, already exists' % item)
  2239. if not isinstance(loc, int):
  2240. raise TypeError("loc must be int")
  2241. block = make_block(values=value,
  2242. ndim=self.ndim,
  2243. placement=slice(loc, loc+1))
  2244. for blkno, count in _fast_count_smallints(self._blknos[loc:]):
  2245. blk = self.blocks[blkno]
  2246. if count == len(blk.mgr_locs):
  2247. blk.mgr_locs = blk.mgr_locs.add(1)
  2248. else:
  2249. new_mgr_locs = blk.mgr_locs.as_array.copy()
  2250. new_mgr_locs[new_mgr_locs >= loc] += 1
  2251. blk.mgr_locs = new_mgr_locs
  2252. if loc == self._blklocs.shape[0]:
  2253. # np.append is a lot faster (at least in numpy 1.7.1), let's use it
  2254. # if we can.
  2255. self._blklocs = np.append(self._blklocs, 0)
  2256. self._blknos = np.append(self._blknos, len(self.blocks))
  2257. else:
  2258. self._blklocs = np.insert(self._blklocs, loc, 0)
  2259. self._blknos = np.insert(self._blknos, loc, len(self.blocks))
  2260. self.axes[0] = self.items.insert(loc, item)
  2261. self.blocks += (block,)
  2262. self._shape = None
  2263. self._known_consolidated = False
  2264. if len(self.blocks) > 100:
  2265. self._consolidate_inplace()
  2266. def reindex_axis(self, new_index, axis, method=None, limit=None,
  2267. fill_value=None, copy=True):
  2268. """
  2269. Conform block manager to new index.
  2270. """
  2271. new_index = _ensure_index(new_index)
  2272. new_index, indexer = self.axes[axis].reindex(
  2273. new_index, method=method, limit=limit, copy_if_needed=True)
  2274. return self.reindex_indexer(new_index, indexer, axis=axis,
  2275. fill_value=fill_value, copy=copy)
  2276. def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
  2277. allow_dups=False, copy=True):
  2278. """
  2279. Parameters
  2280. ----------
  2281. new_axis : Index
  2282. indexer : ndarray of int64 or None
  2283. axis : int
  2284. fill_value : object
  2285. allow_dups : bool
  2286. pandas-indexer with -1's only.
  2287. """
  2288. if indexer is None:
  2289. if new_axis is self.axes[axis] and not copy:
  2290. return self
  2291. result = self.copy(deep=copy)
  2292. result.axes = list(self.axes)
  2293. result.axes[axis] = new_axis
  2294. return result
  2295. self._consolidate_inplace()
  2296. # trying to reindex on an axis with duplicates
  2297. if (not allow_dups and not self.axes[axis].is_unique
  2298. and len(indexer)):
  2299. raise ValueError("cannot reindex from a duplicate axis")
  2300. if axis >= self.ndim:
  2301. raise IndexError("Requested axis not found in manager")
  2302. if axis == 0:
  2303. new_blocks = self._slice_take_blocks_ax0(
  2304. indexer, fill_tuple=(fill_value,))
  2305. else:
  2306. new_blocks = [blk.take_nd(indexer, axis=axis,
  2307. fill_tuple=(fill_value if fill_value is not None else
  2308. blk.fill_value,))
  2309. for blk in self.blocks]
  2310. new_axes = list(self.axes)
  2311. new_axes[axis] = new_axis
  2312. return self.__class__(new_blocks, new_axes)
def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
    """
    Slice/take blocks along axis=0.

    Overloaded for SingleBlock

    Parameters
    ----------
    slice_or_indexer : slice or indexer selecting items
    fill_tuple : 1-tuple holding the fill value, or None
        Filling is allowed exactly when this is not None.

    Returns
    -------
    new_blocks : list of Block
    """
    allow_fill = fill_tuple is not None

    # NOTE(review): the helper presumably normalises the input and reports
    # its kind ('slice', 'mask' or something else), the normalised object
    # and the resulting length -- confirm against its definition.
    sl_type, slobj, sllen = _preprocess_slice_or_indexer(
        slice_or_indexer, self.shape[0], allow_fill=allow_fill)

    if self._is_single_block:
        blk = self.blocks[0]

        if sl_type in ('slice', 'mask'):
            return [blk.getitem_block(slobj,
                                      new_mgr_locs=slice(0, sllen))]
        elif not allow_fill or self.ndim == 1:
            if allow_fill and fill_tuple[0] is None:
                # no explicit fill value: use the one _maybe_promote
                # reports for the block's dtype
                _, fill_value = com._maybe_promote(blk.dtype)
                fill_tuple = (fill_value,)

            return [blk.take_nd(slobj, axis=0,
                                new_mgr_locs=slice(0, sllen),
                                fill_tuple=fill_tuple)]
        # otherwise (fancy indexer with filling on ndim > 1) fall through
        # to the general path below

    # block number / position-in-block for every selected item
    if sl_type in ('slice', 'mask'):
        blknos = self._blknos[slobj]
        blklocs = self._blklocs[slobj]
    else:
        blknos = com.take_1d(self._blknos, slobj, fill_value=-1,
                             allow_fill=allow_fill)
        blklocs = com.take_1d(self._blklocs, slobj, fill_value=-1,
                              allow_fill=allow_fill)

    # When filling blknos, make sure blknos is updated before appending to
    # blocks list, that way new blkno is exactly len(blocks).
    #
    # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order,
    # pytables serialization will break otherwise.
    blocks = []
    for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks),
                                                 group=True):
        if blkno == -1:
            # If we've got here, fill_tuple was not None.
            fill_value = fill_tuple[0]

            blocks.append(self._make_na_block(
                placement=mgr_locs, fill_value=fill_value))
        else:
            blk = self.blocks[blkno]

            # Otherwise, slicing along items axis is necessary.
            if blk.is_sparse:
                # A sparse block, it's easy, because there's only one item
                # and each mgr loc is a copy of that single item.
                for mgr_loc in mgr_locs:
                    newblk = blk.copy(deep=True)
                    newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)
                    blocks.append(newblk)

            else:
                blocks.append(blk.take_nd(
                    blklocs[mgr_locs.indexer], axis=0,
                    new_mgr_locs=mgr_locs, fill_tuple=None))

    return blocks
  2372. def _make_na_block(self, placement, fill_value=None):
  2373. # TODO: infer dtypes other than float64 from fill_value
  2374. if fill_value is None:
  2375. fill_value = np.nan
  2376. block_shape = list(self.shape)
  2377. block_shape[0] = len(placement)
  2378. dtype, fill_value = com._infer_dtype_from_scalar(fill_value)
  2379. block_values = np.empty(block_shape, dtype=dtype)
  2380. block_values.fill(fill_value)
  2381. return make_block(block_values, placement=placement)
  2382. def take(self, indexer, axis=1, verify=True, convert=True):
  2383. """
  2384. Take items along any axis.
  2385. """
  2386. self._consolidate_inplace()
  2387. indexer = np.asanyarray(indexer, dtype=np.int_)
  2388. n = self.shape[axis]
  2389. if convert:
  2390. indexer = _maybe_convert_indices(indexer, n)
  2391. if verify:
  2392. if ((indexer == -1) | (indexer >= n)).any():
  2393. raise Exception('Indices must be nonzero and less than '
  2394. 'the axis length')
  2395. new_labels = self.axes[axis].take(indexer)
  2396. return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
  2397. axis=axis, allow_dups=True)
  2398. def merge(self, other, lsuffix='', rsuffix=''):
  2399. if not self._is_indexed_like(other):
  2400. raise AssertionError('Must have same axes to merge managers')
  2401. l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix,
  2402. right=other.items, rsuffix=rsuffix)
  2403. new_items = _concat_indexes([l, r])
  2404. new_blocks = [blk.copy(deep=False)
  2405. for blk in self.blocks]
  2406. offset = self.shape[0]
  2407. for blk in other.blocks:
  2408. blk = blk.copy(deep=False)
  2409. blk.mgr_locs = blk.mgr_locs.add(offset)
  2410. new_blocks.append(blk)
  2411. new_axes = list(self.axes)
  2412. new_axes[0] = new_items
  2413. return self.__class__(_consolidate(new_blocks), new_axes)
  2414. def _is_indexed_like(self, other):
  2415. """
  2416. Check all axes except items
  2417. """
  2418. if self.ndim != other.ndim:
  2419. raise AssertionError(('Number of dimensions must agree '
  2420. 'got %d and %d') % (self.ndim, other.ndim))
  2421. for ax, oax in zip(self.axes[1:], other.axes[1:]):
  2422. if not ax.equals(oax):
  2423. return False
  2424. return True
  2425. def equals(self, other):
  2426. self_axes, other_axes = self.axes, other.axes
  2427. if len(self_axes) != len(other_axes):
  2428. return False
  2429. if not all (ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
  2430. return False
  2431. self._consolidate_inplace()
  2432. other._consolidate_inplace()
  2433. return all(block.equals(oblock) for block, oblock in
  2434. zip(self.blocks, other.blocks))
  2435. class SingleBlockManager(BlockManager):
  2436. """ manage a single block with """
  2437. ndim = 1
  2438. _is_consolidated = True
  2439. _known_consolidated = True
  2440. __slots__ = ()
  2441. def __init__(self, block, axis, do_integrity_check=False, fastpath=False):
  2442. if isinstance(axis, list):
  2443. if len(axis) != 1:
  2444. raise ValueError(
  2445. "cannot create SingleBlockManager with more than 1 axis")
  2446. axis = axis[0]
  2447. # passed from constructor, single block, single axis
  2448. if fastpath:
  2449. self.axes = [axis]
  2450. if isinstance(block, list):
  2451. # empty block
  2452. if len(block) == 0:
  2453. block = [np.array([])]
  2454. elif len(block) != 1:
  2455. raise ValueError('Cannot create SingleBlockManager with '
  2456. 'more than 1 block')
  2457. block = block[0]
  2458. else:
  2459. self.axes = [_ensure_index(axis)]
  2460. # create the block here
  2461. if isinstance(block, list):
  2462. # provide consolidation to the interleaved_dtype
  2463. if len(block) > 1:
  2464. dtype = _interleaved_dtype(block)
  2465. block = [b.astype(dtype) for b in block]
  2466. block = _consolidate(block)
  2467. if len(block) != 1:
  2468. raise ValueError('Cannot create SingleBlockManager with '
  2469. 'more than 1 block')
  2470. block = block[0]
  2471. if not isinstance(block, Block):
  2472. block = make_block(block,
  2473. placement=slice(0, len(axis)),
  2474. ndim=1, fastpath=True)
  2475. self.blocks = [block]
  2476. def _post_setstate(self):
  2477. pass
  2478. @property
  2479. def _block(self):
  2480. return self.blocks[0]
  2481. @property
  2482. def _values(self):
  2483. return self._block.values
  2484. def reindex(self, new_axis, indexer=None, method=None, fill_value=None,
  2485. limit=None, copy=True):
  2486. # if we are the same and don't copy, just return
  2487. if self.index.equals(new_axis):
  2488. if copy:
  2489. return self.copy(deep=True)
  2490. else:
  2491. return self
  2492. values = self._block.get_values()
  2493. if indexer is None:
  2494. indexer = self.items.get_indexer_for(new_axis)
  2495. if fill_value is None:
  2496. # FIXME: is fill_value used correctly in sparse blocks?
  2497. if not self._block.is_sparse:
  2498. fill_value = self._block.fill_value
  2499. else:
  2500. fill_value = np.nan
  2501. new_values = com.take_1d(values, indexer,
  2502. fill_value=fill_value)
  2503. # fill if needed
  2504. if method is not None or limit is not None:
  2505. new_values = com.interpolate_2d(new_values, method=method,
  2506. limit=limit, fill_value=fill_value)
  2507. if self._block.is_sparse:
  2508. make_block = self._block.make_block_same_class
  2509. block = make_block(new_values, copy=copy,
  2510. placement=slice(0, len(new_axis)))
  2511. mgr = SingleBlockManager(block, new_axis)
  2512. mgr._consolidate_inplace()
  2513. return mgr
  2514. def get_slice(self, slobj, axis=0):
  2515. if axis >= self.ndim:
  2516. raise IndexError("Requested axis not found in manager")
  2517. return self.__class__(self._block._slice(slobj),
  2518. self.index[slobj], fastpath=True)
  2519. @property
  2520. def index(self):
  2521. return self.axes[0]
  2522. def convert(self, **kwargs):
  2523. """ convert the whole block as one """
  2524. kwargs['by_item'] = False
  2525. return self.apply('convert', **kwargs)
  2526. @property
  2527. def dtype(self):
  2528. return self._values.dtype
  2529. @property
  2530. def ftype(self):
  2531. return self._block.ftype
  2532. def get_dtype_counts(self):
  2533. return {self.dtype.name: 1}
  2534. def get_ftype_counts(self):
  2535. return {self.ftype: 1}
  2536. def get_dtypes(self):
  2537. return np.array([self._block.dtype])
  2538. def get_ftypes(self):
  2539. return np.array([self._block.ftype])
  2540. @property
  2541. def values(self):
  2542. return self._values.view()
  2543. @property
  2544. def itemsize(self):
  2545. return self._values.itemsize
  2546. @property
  2547. def _can_hold_na(self):
  2548. return self._block._can_hold_na
  2549. def is_consolidated(self):
  2550. return True
  2551. def _consolidate_check(self):
  2552. pass
  2553. def _consolidate_inplace(self):
  2554. pass
  2555. def delete(self, item):
  2556. """
  2557. Delete single item from SingleBlockManager.
  2558. Ensures that self.blocks doesn't become empty.
  2559. """
  2560. loc = self.items.get_loc(item)
  2561. self._block.delete(loc)
  2562. self.axes[0] = self.axes[0].delete(loc)
  2563. def fast_xs(self, loc):
  2564. """
  2565. fast path for getting a cross-section
  2566. return a view of the data
  2567. """
  2568. return self._block.values[loc]
  2569. def construction_error(tot_items, block_shape, axes, e=None):
  2570. """ raise a helpful message about our construction """
  2571. passed = tuple(map(int, [tot_items] + list(block_shape)))
  2572. implied = tuple(map(int, [len(ax) for ax in axes]))
  2573. if passed == implied and e is not None:
  2574. raise e
  2575. raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
  2576. passed,implied))
  2577. def create_block_manager_from_blocks(blocks, axes):
  2578. try:
  2579. if len(blocks) == 1 and not isinstance(blocks[0], Block):
  2580. # It's OK if a single block is passed as values, its placement is
  2581. # basically "all items", but if there're many, don't bother
  2582. # converting, it's an error anyway.
  2583. blocks = [make_block(values=blocks[0],
  2584. placement=slice(0, len(axes[0])))]
  2585. mgr = BlockManager(blocks, axes)
  2586. mgr._consolidate_inplace()
  2587. return mgr
  2588. except (ValueError) as e:
  2589. blocks = [getattr(b, 'values', b) for b in blocks]
  2590. tot_items = sum(b.shape[0] for b in blocks)
  2591. construction_error(tot_items, blocks[0].shape[1:], axes, e)
  2592. def create_block_manager_from_arrays(arrays, names, axes):
  2593. try:
  2594. blocks = form_blocks(arrays, names, axes)
  2595. mgr = BlockManager(blocks, axes)
  2596. mgr._consolidate_inplace()
  2597. return mgr
  2598. except (ValueError) as e:
  2599. construction_error(len(arrays), arrays[0].shape[1:], axes, e)
  2600. def form_blocks(arrays, names, axes):
  2601. # put "leftover" items in float bucket, where else?
  2602. # generalize?
  2603. float_items = []
  2604. complex_items = []
  2605. int_items = []
  2606. bool_items = []
  2607. object_items = []
  2608. sparse_items = []
  2609. datetime_items = []
  2610. extra_locs = []
  2611. names_idx = Index(names)
  2612. if names_idx.equals(axes[0]):
  2613. names_indexer = np.arange(len(names_idx))
  2614. else:
  2615. assert names_idx.intersection(axes[0]).is_unique
  2616. names_indexer = names_idx.get_indexer_for(axes[0])
  2617. for i, name_idx in enumerate(names_indexer):
  2618. if name_idx == -1:
  2619. extra_locs.append(i)
  2620. continue
  2621. k = names[name_idx]
  2622. v = arrays[name_idx]
  2623. if isinstance(v, (SparseArray, ABCSparseSeries)):
  2624. sparse_items.append((i, k, v))
  2625. elif issubclass(v.dtype.type, np.floating):
  2626. float_items.append((i, k, v))
  2627. elif issubclass(v.dtype.type, np.complexfloating):
  2628. complex_items.append((i, k, v))
  2629. elif issubclass(v.dtype.type, np.datetime64):
  2630. if v.dtype != _NS_DTYPE:
  2631. v = tslib.cast_to_nanoseconds(v)
  2632. if hasattr(v, 'tz') and v.tz is not None:
  2633. object_items.append((i, k, v))
  2634. else:
  2635. datetime_items.append((i, k, v))
  2636. elif issubclass(v.dtype.type, np.integer):
  2637. if v.dtype == np.uint64:
  2638. # HACK #2355 definite overflow
  2639. if (v > 2 ** 63 - 1).any():
  2640. object_items.append((i, k, v))
  2641. continue
  2642. int_items.append((i, k, v))
  2643. elif v.dtype == np.bool_:
  2644. bool_items.append((i, k, v))
  2645. else:
  2646. object_items.append((i, k, v))
  2647. blocks = []
  2648. if len(float_items):
  2649. float_blocks = _multi_blockify(float_items)
  2650. blocks.extend(float_blocks)
  2651. if len(complex_items):
  2652. complex_blocks = _simple_blockify(
  2653. complex_items, np.complex128)
  2654. blocks.extend(complex_blocks)
  2655. if len(int_items):
  2656. int_blocks = _multi_blockify(int_items)
  2657. blocks.extend(int_blocks)
  2658. if len(datetime_items):
  2659. datetime_blocks = _simple_blockify(
  2660. datetime_items, _NS_DTYPE)
  2661. blocks.extend(datetime_blocks)
  2662. if len(bool_items):
  2663. bool_blocks = _simple_blockify(
  2664. bool_items, np.bool_)
  2665. blocks.extend(bool_blocks)
  2666. if len(object_items) > 0:
  2667. object_blocks = _simple_blockify(
  2668. object_items, np.object_)
  2669. blocks.extend(object_blocks)
  2670. if len(sparse_items) > 0:
  2671. sparse_blocks = _sparse_blockify(sparse_items)
  2672. blocks.extend(sparse_blocks)
  2673. if len(extra_locs):
  2674. shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])
  2675. # empty items -> dtype object
  2676. block_values = np.empty(shape, dtype=object)
  2677. block_values.fill(np.nan)
  2678. na_block = make_block(block_values, placement=extra_locs)
  2679. blocks.append(na_block)
  2680. return blocks
  2681. def _simple_blockify(tuples, dtype):
  2682. """ return a single array of a block that has a single dtype; if dtype is
  2683. not None, coerce to this dtype
  2684. """
  2685. values, placement = _stack_arrays(tuples, dtype)
  2686. # CHECK DTYPE?
  2687. if dtype is not None and values.dtype != dtype: # pragma: no cover
  2688. values = values.astype(dtype)
  2689. block = make_block(values, placement=placement)
  2690. return [block]
  2691. def _multi_blockify(tuples, dtype=None):
  2692. """ return an array of blocks that potentially have different dtypes """
  2693. # group by dtype
  2694. grouper = itertools.groupby(tuples, lambda x: x[2].dtype)
  2695. new_blocks = []
  2696. for dtype, tup_block in grouper:
  2697. values, placement = _stack_arrays(
  2698. list(tup_block), dtype)
  2699. block = make_block(values, placement=placement)
  2700. new_blocks.append(block)
  2701. return new_blocks
  2702. def _sparse_blockify(tuples, dtype=None):
  2703. """ return an array of blocks that potentially have different dtypes (and
  2704. are sparse)
  2705. """
  2706. new_blocks = []
  2707. for i, names, array in tuples:
  2708. array = _maybe_to_sparse(array)
  2709. block = make_block(
  2710. array, klass=SparseBlock, fastpath=True,
  2711. placement=[i])
  2712. new_blocks.append(block)
  2713. return new_blocks
  2714. def _stack_arrays(tuples, dtype):
  2715. # fml
  2716. def _asarray_compat(x):
  2717. if isinstance(x, ABCSeries):
  2718. return x.values
  2719. else:
  2720. return np.asarray(x)
  2721. def _shape_compat(x):
  2722. if isinstance(x, ABCSeries):
  2723. return len(x),
  2724. else:
  2725. return x.shape
  2726. placement, names, arrays = zip(*tuples)
  2727. first = arrays[0]
  2728. shape = (len(arrays),) + _shape_compat(first)
  2729. stacked = np.empty(shape, dtype=dtype)
  2730. for i, arr in enumerate(arrays):
  2731. stacked[i] = _asarray_compat(arr)
  2732. return stacked, placement
  2733. def _interleaved_dtype(blocks):
  2734. if not len(blocks):
  2735. return None
  2736. counts = defaultdict(lambda: [])
  2737. for x in blocks:
  2738. counts[type(x)].append(x)
  2739. def _lcd_dtype(l):
  2740. """ find the lowest dtype that can accomodate the given types """
  2741. m = l[0].dtype
  2742. for x in l[1:]:
  2743. if x.dtype.itemsize > m.itemsize:
  2744. m = x.dtype
  2745. return m
  2746. have_int = len(counts[IntBlock]) > 0
  2747. have_bool = len(counts[BoolBlock]) > 0
  2748. have_object = len(counts[ObjectBlock]) > 0
  2749. have_float = len(counts[FloatBlock]) > 0
  2750. have_complex = len(counts[ComplexBlock]) > 0
  2751. have_dt64 = len(counts[DatetimeBlock]) > 0
  2752. have_td64 = len(counts[TimeDeltaBlock]) > 0
  2753. have_sparse = len(counts[SparseBlock]) > 0
  2754. have_numeric = have_float or have_complex or have_int
  2755. if (have_object or
  2756. (have_bool and have_numeric) or
  2757. (have_numeric and (have_dt64 or have_td64))):
  2758. return np.dtype(object)
  2759. elif have_bool:
  2760. return np.dtype(bool)
  2761. elif have_int and not have_float and not have_complex:
  2762. # if we are mixing unsigned and signed, then return
  2763. # the next biggest int type (if we can)
  2764. lcd = _lcd_dtype(counts[IntBlock])
  2765. kinds = set([i.dtype.kind for i in counts[IntBlock]])
  2766. if len(kinds) == 1:
  2767. return lcd
  2768. if lcd == 'uint64' or lcd == 'int64':
  2769. return np.dtype('int64')
  2770. # return 1 bigger on the itemsize if unsinged
  2771. if lcd.kind == 'u':
  2772. return np.dtype('int%s' % (lcd.itemsize * 8 * 2))
  2773. return lcd
  2774. elif have_dt64 and not have_float and not have_complex:
  2775. return np.dtype('M8[ns]')
  2776. elif have_td64 and not have_float and not have_complex:
  2777. return np.dtype('m8[ns]')
  2778. elif have_complex:
  2779. return np.dtype('c16')
  2780. else:
  2781. return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock])
  2782. def _consolidate(blocks):
  2783. """
  2784. Merge blocks having same dtype, exclude non-consolidating blocks
  2785. """
  2786. # sort by _can_consolidate, dtype
  2787. gkey = lambda x: x._consolidate_key
  2788. grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)
  2789. new_blocks = []
  2790. for (_can_consolidate, dtype), group_blocks in grouper:
  2791. merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype,
  2792. _can_consolidate=_can_consolidate)
  2793. if isinstance(merged_blocks, list):
  2794. new_blocks.extend(merged_blocks)
  2795. else:
  2796. new_blocks.append(merged_blocks)
  2797. return new_blocks
  2798. def _merge_blocks(blocks, dtype=None, _can_consolidate=True):
  2799. if len(blocks) == 1:
  2800. return blocks[0]
  2801. if _can_consolidate:
  2802. if dtype is None:
  2803. if len(set([b.dtype for b in blocks])) != 1:
  2804. raise AssertionError("_merge_blocks are invalid!")
  2805. dtype = blocks[0].dtype
  2806. # FIXME: optimization potential in case all mgrs contain slices and
  2807. # combination of those slices is a slice, too.
  2808. new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
  2809. new_values = _vstack([b.values for b in blocks], dtype)
  2810. argsort = np.argsort(new_mgr_locs)
  2811. new_values = new_values[argsort]
  2812. new_mgr_locs = new_mgr_locs[argsort]
  2813. return make_block(new_values,
  2814. fastpath=True, placement=new_mgr_locs)
  2815. # no merge
  2816. return blocks
  2817. def _block_shape(values, ndim=1, shape=None):
  2818. """ guarantee the shape of the values to be at least 1 d """
  2819. if values.ndim <= ndim:
  2820. if shape is None:
  2821. shape = values.shape
  2822. values = values.reshape(tuple((1,) + shape))
  2823. return values
  2824. def _vstack(to_stack, dtype):
  2825. # work around NumPy 1.6 bug
  2826. if dtype == _NS_DTYPE or dtype == _TD_DTYPE:
  2827. new_values = np.vstack([x.view('i8') for x in to_stack])
  2828. return new_values.view(dtype)
  2829. else:
  2830. return np.vstack(to_stack)
  2831. def _possibly_compare(a, b, op):
  2832. res = op(a, b)
  2833. is_a_array = isinstance(a, np.ndarray)
  2834. is_b_array = isinstance(b, np.ndarray)
  2835. if np.isscalar(res) and (is_a_array or is_b_array):
  2836. type_names = [type(a).__name__, type(b).__name__]
  2837. if is_a_array:
  2838. type_names[0] = 'ndarray(dtype=%s)' % a.dtype
  2839. if is_b_array:
  2840. type_names[1] = 'ndarray(dtype=%s)' % b.dtype
  2841. raise TypeError("Cannot compare types %r and %r" % tuple(type_names))
  2842. return res
  2843. def _concat_indexes(indexes):
  2844. return indexes[0].append(indexes[1:])
  2845. def _get_blkno_placements(blknos, blk_count, group=True):
  2846. """
  2847. Parameters
  2848. ----------
  2849. blknos : array of int64
  2850. blk_count : int
  2851. group : bool
  2852. Returns
  2853. -------
  2854. iterator
  2855. yield (BlockPlacement, blkno)
  2856. """
  2857. blknos = com._ensure_int64(blknos)
  2858. # FIXME: blk_count is unused, but it may avoid the use of dicts in cython
  2859. for blkno, indexer in lib.get_blkno_indexers(blknos, group):
  2860. yield blkno, BlockPlacement(indexer)
  2861. def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
  2862. """
  2863. If two indices overlap, add suffixes to overlapping entries.
  2864. If corresponding suffix is empty, the entry is simply converted to string.
  2865. """
  2866. to_rename = left.intersection(right)
  2867. if len(to_rename) == 0:
  2868. return left, right
  2869. else:
  2870. if not lsuffix and not rsuffix:
  2871. raise ValueError('columns overlap but no suffix specified: %s' %
  2872. to_rename)
  2873. def lrenamer(x):
  2874. if x in to_rename:
  2875. return '%s%s' % (x, lsuffix)
  2876. return x
  2877. def rrenamer(x):
  2878. if x in to_rename:
  2879. return '%s%s' % (x, rsuffix)
  2880. return x
  2881. return (_transform_index(left, lrenamer),
  2882. _transform_index(right, rrenamer))
  2883. def _transform_index(index, func):
  2884. """
  2885. Apply function to all values found in index.
  2886. This includes transforming multiindex entries separately.
  2887. """
  2888. if isinstance(index, MultiIndex):
  2889. items = [tuple(func(y) for y in x) for x in index]
  2890. return MultiIndex.from_tuples(items, names=index.names)
  2891. else:
  2892. items = [func(x) for x in index]
  2893. return Index(items, name=index.name)
  2894. def _putmask_smart(v, m, n):
  2895. """
  2896. Return a new block, try to preserve dtype if possible.
  2897. Parameters
  2898. ----------
  2899. v : array_like
  2900. m : array_like
  2901. n : array_like
  2902. """
  2903. # n should be the length of the mask or a scalar here
  2904. if not is_list_like(n):
  2905. n = np.array([n] * len(m))
  2906. # see if we are only masking values that if putted
  2907. # will work in the current dtype
  2908. try:
  2909. nn = n[m]
  2910. nn_at = nn.astype(v.dtype)
  2911. if (nn == nn_at).all():
  2912. nv = v.copy()
  2913. nv[m] = nn_at
  2914. return nv
  2915. except (ValueError, IndexError, TypeError):
  2916. pass
  2917. # change the dtype
  2918. dtype, _ = com._maybe_promote(n.dtype)
  2919. nv = v.astype(dtype)
  2920. try:
  2921. nv[m] = n
  2922. except ValueError:
  2923. idx, = np.where(np.squeeze(m))
  2924. for mask_index, new_val in zip(idx, n):
  2925. nv[mask_index] = new_val
  2926. return nv
  2927. def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
  2928. """
  2929. Concatenate block managers into one.
  2930. Parameters
  2931. ----------
  2932. mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
  2933. axes : list of Index
  2934. concat_axis : int
  2935. copy : bool
  2936. """
  2937. concat_plan = combine_concat_plans([get_mgr_concatenation_plan(mgr, indexers)
  2938. for mgr, indexers in mgrs_indexers],
  2939. concat_axis)
  2940. blocks = [make_block(concatenate_join_units(join_units, concat_axis,
  2941. copy=copy),
  2942. placement=placement)
  2943. for placement, join_units in concat_plan]
  2944. return BlockManager(blocks, axes)
  2945. def get_empty_dtype_and_na(join_units):
  2946. """
  2947. Return dtype and N/A values to use when concatenating specified units.
  2948. Returned N/A value may be None which means there was no casting involved.
  2949. Returns
  2950. -------
  2951. dtype
  2952. na
  2953. """
  2954. if len(join_units) == 1:
  2955. blk = join_units[0].block
  2956. if blk is None:
  2957. return np.float64, np.nan
  2958. else:
  2959. return blk.dtype, None
  2960. has_none_blocks = False
  2961. dtypes = [None] * len(join_units)
  2962. for i, unit in enumerate(join_units):
  2963. if unit.block is None:
  2964. has_none_blocks = True
  2965. else:
  2966. dtypes[i] = unit.dtype
  2967. if not has_none_blocks and len(set(dtypes)) == 1:
  2968. # Unanimous decision, nothing to upcast.
  2969. return dtypes[0], None
  2970. # dtypes = set()
  2971. upcast_classes = set()
  2972. null_upcast_classes = set()
  2973. for dtype, unit in zip(dtypes, join_units):
  2974. if dtype is None:
  2975. continue
  2976. if issubclass(dtype.type, (np.object_, np.bool_)):
  2977. upcast_cls = 'object'
  2978. elif is_datetime64_dtype(dtype):
  2979. upcast_cls = 'datetime'
  2980. elif is_timedelta64_dtype(dtype):
  2981. upcast_cls = 'timedelta'
  2982. else:
  2983. upcast_cls = 'float'
  2984. # Null blocks should not influence upcast class selection, unless there
  2985. # are only null blocks, when same upcasting rules must be applied to
  2986. # null upcast classes.
  2987. if unit.is_null:
  2988. null_upcast_classes.add(upcast_cls)
  2989. else:
  2990. upcast_classes.add(upcast_cls)
  2991. if not upcast_classes:
  2992. upcast_classes = null_upcast_classes
  2993. # create the result
  2994. if 'object' in upcast_classes:
  2995. return np.dtype(np.object_), np.nan
  2996. elif 'float' in upcast_classes:
  2997. return np.dtype(np.float64), np.nan
  2998. elif 'datetime' in upcast_classes:
  2999. return np.dtype('M8[ns]'), tslib.iNaT
  3000. elif 'timedelta' in upcast_classes:
  3001. return np.dtype('m8[ns]'), tslib.iNaT
  3002. else: # pragma
  3003. raise AssertionError("invalid dtype determination in get_concat_dtype")
  3004. def concatenate_join_units(join_units, concat_axis, copy):
  3005. """
  3006. Concatenate values from several join units along selected axis.
  3007. """
  3008. if concat_axis == 0 and len(join_units) > 1:
  3009. # Concatenating join units along ax0 is handled in _merge_blocks.
  3010. raise AssertionError("Concatenating join units along axis0")
  3011. empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)
  3012. to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,
  3013. upcasted_na=upcasted_na)
  3014. for ju in join_units]
  3015. if len(to_concat) == 1:
  3016. # Only one block, nothing to concatenate.
  3017. concat_values = to_concat[0]
  3018. if copy and concat_values.base is not None:
  3019. concat_values = concat_values.copy()
  3020. else:
  3021. concat_values = com._concat_compat(to_concat, axis=concat_axis)
  3022. # FIXME: optimization potential: if len(join_units) == 1, single join unit
  3023. # is densified and sparsified back.
  3024. if any(unit.is_sparse for unit in join_units):
  3025. # If one of the units was sparse, concat_values are 2d and there's only
  3026. # one item.
  3027. return SparseArray(concat_values[0])
  3028. else:
  3029. return concat_values
  3030. def get_mgr_concatenation_plan(mgr, indexers):
  3031. """
  3032. Construct concatenation plan for given block manager and indexers.
  3033. Parameters
  3034. ----------
  3035. mgr : BlockManager
  3036. indexers : dict of {axis: indexer}
  3037. Returns
  3038. -------
  3039. plan : list of (BlockPlacement, JoinUnit) tuples
  3040. """
  3041. # Calculate post-reindex shape , save for item axis which will be separate
  3042. # for each block anyway.
  3043. mgr_shape = list(mgr.shape)
  3044. for ax, indexer in indexers.items():
  3045. mgr_shape[ax] = len(indexer)
  3046. mgr_shape = tuple(mgr_shape)
  3047. if 0 in indexers:
  3048. ax0_indexer = indexers.pop(0)
  3049. blknos = com.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)
  3050. blklocs = com.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)
  3051. else:
  3052. if mgr._is_single_block:
  3053. blk = mgr.blocks[0]
  3054. return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
  3055. ax0_indexer = None
  3056. blknos = mgr._blknos
  3057. blklocs = mgr._blklocs
  3058. plan = []
  3059. for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks),
  3060. group=False):
  3061. assert placements.is_slice_like
  3062. join_unit_indexers = indexers.copy()
  3063. shape = list(mgr_shape)
  3064. shape[0] = len(placements)
  3065. shape = tuple(shape)
  3066. if blkno == -1:
  3067. unit = JoinUnit(None, shape)
  3068. else:
  3069. blk = mgr.blocks[blkno]
  3070. ax0_blk_indexer = blklocs[placements.indexer]
  3071. unit_no_ax0_reindexing = (
  3072. len(placements) == len(blk.mgr_locs) and
  3073. # Fastpath detection of join unit not needing to reindex its
  3074. # block: no ax0 reindexing took place and block placement was
  3075. # sequential before.
  3076. ((ax0_indexer is None
  3077. and blk.mgr_locs.is_slice_like
  3078. and blk.mgr_locs.as_slice.step == 1) or
  3079. # Slow-ish detection: all indexer locs are sequential (and
  3080. # length match is checked above).
  3081. (np.diff(ax0_blk_indexer) == 1).all()))
  3082. # Omit indexer if no item reindexing is required.
  3083. if unit_no_ax0_reindexing:
  3084. join_unit_indexers.pop(0, None)
  3085. else:
  3086. join_unit_indexers[0] = ax0_blk_indexer
  3087. unit = JoinUnit(blk, shape, join_unit_indexers)
  3088. plan.append((placements, unit))
  3089. return plan
  3090. def combine_concat_plans(plans, concat_axis):
  3091. """
  3092. Combine multiple concatenation plans into one.
  3093. existing_plan is updated in-place.
  3094. """
  3095. if len(plans) == 1:
  3096. for p in plans[0]:
  3097. yield p[0], [p[1]]
  3098. elif concat_axis == 0:
  3099. offset = 0
  3100. for plan in plans:
  3101. last_plc = None
  3102. for plc, unit in plan:
  3103. yield plc.add(offset), [unit]
  3104. last_plc = plc
  3105. if last_plc is not None:
  3106. offset += last_plc.as_slice.stop
  3107. else:
  3108. num_ended = [0]
  3109. def _next_or_none(seq):
  3110. retval = next(seq, None)
  3111. if retval is None:
  3112. num_ended[0] += 1
  3113. return retval
  3114. plans = list(map(iter, plans))
  3115. next_items = list(map(_next_or_none, plans))
  3116. while num_ended[0] != len(next_items):
  3117. if num_ended[0] > 0:
  3118. raise ValueError("Plan shapes are not aligned")
  3119. placements, units = zip(*next_items)
  3120. lengths = list(map(len, placements))
  3121. min_len, max_len = min(lengths), max(lengths)
  3122. if min_len == max_len:
  3123. yield placements[0], units
  3124. next_items[:] = map(_next_or_none, plans)
  3125. else:
  3126. yielded_placement = None
  3127. yielded_units = [None] * len(next_items)
  3128. for i, (plc, unit) in enumerate(next_items):
  3129. yielded_units[i] = unit
  3130. if len(plc) > min_len:
  3131. # trim_join_unit updates unit in place, so only
  3132. # placement needs to be sliced to skip min_len.
  3133. next_items[i] = (plc[min_len:],
  3134. trim_join_unit(unit, min_len))
  3135. else:
  3136. yielded_placement = plc
  3137. next_items[i] = _next_or_none(plans[i])
  3138. yield yielded_placement, yielded_units
  3139. def trim_join_unit(join_unit, length):
  3140. """
  3141. Reduce join_unit's shape along item axis to length.
  3142. Extra items that didn't fit are returned as a separate block.
  3143. """
  3144. if 0 not in join_unit.indexers:
  3145. extra_indexers = join_unit.indexers
  3146. if join_unit.block is None:
  3147. extra_block = None
  3148. else:
  3149. extra_block = join_unit.block.getitem_block(slice(length, None))
  3150. join_unit.block = join_unit.block.getitem_block(slice(length))
  3151. else:
  3152. extra_block = join_unit.block
  3153. extra_indexers = copy.copy(join_unit.indexers)
  3154. extra_indexers[0] = extra_indexers[0][length:]
  3155. join_unit.indexers[0] = join_unit.indexers[0][:length]
  3156. extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
  3157. join_unit.shape = (length,) + join_unit.shape[1:]
  3158. return JoinUnit(block=extra_block, indexers=extra_indexers,
  3159. shape=extra_shape)
  3160. class JoinUnit(object):
  3161. def __init__(self, block, shape, indexers={}):
  3162. # Passing shape explicitly is required for cases when block is None.
  3163. self.block = block
  3164. self.indexers = indexers
  3165. self.shape = shape
  3166. def __repr__(self):
  3167. return '%s(%r, %s)' % (self.__class__.__name__,
  3168. self.block, self.indexers)
  3169. @cache_readonly
  3170. def needs_filling(self):
  3171. for indexer in self.indexers.values():
  3172. # FIXME: cache results of indexer == -1 checks.
  3173. if (indexer == -1).any():
  3174. return True
  3175. return False
  3176. @cache_readonly
  3177. def dtype(self):
  3178. if self.block is None:
  3179. raise AssertionError("Block is None, no dtype")
  3180. if not self.needs_filling:
  3181. return self.block.dtype
  3182. else:
  3183. return np.dtype(com._maybe_promote(self.block.dtype,
  3184. self.block.fill_value)[0])
  3185. return self._dtype
  3186. @cache_readonly
  3187. def is_null(self):
  3188. if self.block is None:
  3189. return True
  3190. if not self.block._can_hold_na:
  3191. return False
  3192. # Usually it's enough to check but a small fraction of values to see if
  3193. # a block is NOT null, chunks should help in such cases. 1000 value
  3194. # was chosen rather arbitrarily.
  3195. values_flat = self.block.values.ravel()
  3196. total_len = values_flat.shape[0]
  3197. chunk_len = max(total_len // 40, 1000)
  3198. for i in range(0, total_len, chunk_len):
  3199. if not isnull(values_flat[i: i + chunk_len]).all():
  3200. return False
  3201. return True
  3202. @cache_readonly
  3203. def is_sparse(self):
  3204. return self.block is not None and self.block.is_sparse
  3205. def get_reindexed_values(self, empty_dtype, upcasted_na):
  3206. if upcasted_na is None:
  3207. # No upcasting is necessary
  3208. fill_value = self.block.fill_value
  3209. values = self.block.get_values()
  3210. else:
  3211. fill_value = upcasted_na
  3212. if self.is_null:
  3213. missing_arr = np.empty(self.shape, dtype=empty_dtype)
  3214. if np.prod(self.shape):
  3215. # NumPy 1.6 workaround: this statement gets strange if all
  3216. # blocks are of same dtype and some of them are empty:
  3217. # empty one are considered "null" so they must be filled,
  3218. # but no dtype upcasting happens and the dtype may not
  3219. # allow NaNs.
  3220. #
  3221. # In general, no one should get hurt when one tries to put
  3222. # incorrect values into empty array, but numpy 1.6 is
  3223. # strict about that.
  3224. missing_arr.fill(fill_value)
  3225. return missing_arr
  3226. if self.block.is_bool:
  3227. # External code requested filling/upcasting, bool values must
  3228. # be upcasted to object to avoid being upcasted to numeric.
  3229. values = self.block.astype(np.object_).values
  3230. else:
  3231. # No dtype upcasting is done here, it will be performed during
  3232. # concatenation itself.
  3233. values = self.block.get_values()
  3234. if not self.indexers:
  3235. # If there's no indexing to be done, we want to signal outside
  3236. # code that this array must be copied explicitly. This is done
  3237. # by returning a view and checking `retval.base`.
  3238. return values.view()
  3239. else:
  3240. for ax, indexer in self.indexers.items():
  3241. values = com.take_nd(values, indexer, axis=ax,
  3242. fill_value=fill_value)
  3243. return values
  3244. def _fast_count_smallints(arr):
  3245. """Faster version of set(arr) for sequences of small numbers."""
  3246. if len(arr) == 0:
  3247. # Handle empty arr case separately: numpy 1.6 chokes on that.
  3248. return np.empty((0, 2), dtype=arr.dtype)
  3249. else:
  3250. counts = np.bincount(arr.astype(np.int_))
  3251. nz = counts.nonzero()[0]
  3252. return np.c_[nz, counts[nz]]
  3253. def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill):
  3254. if isinstance(slice_or_indexer, slice):
  3255. return 'slice', slice_or_indexer, lib.slice_len(slice_or_indexer,
  3256. length)
  3257. elif (isinstance(slice_or_indexer, np.ndarray) and
  3258. slice_or_indexer.dtype == np.bool_):
  3259. return 'mask', slice_or_indexer, slice_or_indexer.sum()
  3260. else:
  3261. indexer = np.asanyarray(slice_or_indexer, dtype=np.int64)
  3262. if not allow_fill:
  3263. indexer = _maybe_convert_indices(indexer, length)
  3264. return 'fancy', indexer, len(indexer)