/pandas/core/internals.py
Python | 4069 lines | 3422 code | 325 blank | 322 comment | 311 complexity | ea364a4280ea861ebe94fc70579f020b MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- import copy
- import itertools
- import re
- import operator
- from datetime import datetime, timedelta
- from collections import defaultdict
- import numpy as np
- from pandas.core.base import PandasObject
- from pandas.core.common import (_possibly_downcast_to_dtype, isnull,
- _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
- ABCSparseSeries, _infer_dtype_from_scalar,
- _is_null_datelike_scalar,
- is_timedelta64_dtype, is_datetime64_dtype,
- _possibly_infer_to_datetimelike)
- from pandas.core.index import Index, MultiIndex, _ensure_index
- from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer)
- import pandas.core.common as com
- from pandas.sparse.array import _maybe_to_sparse, SparseArray
- import pandas.lib as lib
- import pandas.tslib as tslib
- import pandas.computation.expressions as expressions
- from pandas.util.decorators import cache_readonly
- from pandas.tslib import Timestamp
- from pandas import compat
- from pandas.compat import range, map, zip, u
- from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type
- from pandas.lib import BlockPlacement
class Block(PandasObject):
    """
    Canonical n-dimensional unit of homogeneous dtype contained in a pandas
    data structure

    Index-ignorant; let the container take care of that
    """
    # per-instance storage: manager placement, the data itself and its ndim
    __slots__ = ['_mgr_locs', 'values', 'ndim']
    # dtype-family flags; all off here, subclasses switch on what applies
    is_numeric = False
    is_float = False
    is_integer = False
    is_complex = False
    is_datetime = False
    is_timedelta = False
    is_bool = False
    is_object = False
    is_sparse = False
    # fillna / _interpolate_with_fill return early when this is False
    _can_hold_na = False
    _downcast_dtype = None
    # first element of _consolidate_key
    _can_consolidate = True
    _verify_integrity = True
    # suffix used when building ftype ("<dtype>:<_ftype>")
    _ftype = 'dense'
- def __init__(self, values, placement, ndim=None, fastpath=False):
- if ndim is None:
- ndim = values.ndim
- elif values.ndim != ndim:
- raise ValueError('Wrong number of dimensions')
- self.ndim = ndim
- self.mgr_locs = placement
- self.values = values
- if len(self.mgr_locs) != len(self.values):
- raise ValueError('Wrong number of items passed %d,'
- ' placement implies %d' % (
- len(self.values), len(self.mgr_locs)))
- @property
- def _consolidate_key(self):
- return (self._can_consolidate, self.dtype.name)
- @property
- def _is_single_block(self):
- return self.ndim == 1
@property
def is_datelike(self):
    """ return True if I am datelike (a datetime or timedelta block) """
    return self.is_datetime or self.is_timedelta
@property
def fill_value(self):
    """ default value used for filling: NaN """
    nan = np.nan
    return nan
@property
def mgr_locs(self):
    """ the placement stored in ``_mgr_locs`` """
    placement = self._mgr_locs
    return placement
def make_block_same_class(self, values, placement, copy=False,
                          **kwargs):
    """
    Build a block of my own class around ``values``.

    ``values`` is copied first when ``copy`` is True. ``kwargs`` are not
    used here; they exist for the SparseBlock override.
    """
    payload = values.copy() if copy else values
    return make_block(payload, placement, klass=self.__class__,
                      fastpath=True)
@mgr_locs.setter
def mgr_locs(self, new_mgr_locs):
    """ store the placement, wrapping it in a BlockPlacement if needed """
    already_wrapped = isinstance(new_mgr_locs, BlockPlacement)
    if already_wrapped:
        self._mgr_locs = new_mgr_locs
    else:
        self._mgr_locs = BlockPlacement(new_mgr_locs)
def __unicode__(self):
    """ short description: class name, size/placement and dtype """
    # the items themselves are deliberately left out
    cls_name = com.pprint_thing(self.__class__.__name__)
    if not self._is_single_block:
        dims = ' x '.join([com.pprint_thing(s) for s in self.shape])
        return '%s: %s, %s, dtype: %s' % (
            cls_name, com.pprint_thing(self.mgr_locs.indexer), dims,
            self.dtype)
    return '%s: %s dtype: %s' % (
        cls_name, len(self), self.dtype)
- def __len__(self):
- return len(self.values)
- def __getstate__(self):
- return self.mgr_locs.indexer, self.values
def __setstate__(self, state):
    """ restore from the (indexer, values) pair made by __getstate__ """
    indexer = state[0]
    self.mgr_locs = BlockPlacement(indexer)
    data = state[1]
    self.values = data
    # ndim is not pickled; recover it from the values
    self.ndim = self.values.ndim
- def _slice(self, slicer):
- """ return a slice of my values """
- return self.values[slicer]
def getitem_block(self, slicer, new_mgr_locs=None):
    """
    __getitem__-like access that returns a block of my own class.

    When ``new_mgr_locs`` is not given it is derived by applying the
    axis-0 part of ``slicer`` to my placement.

    Raises ValueError if the sliced values lose or gain a dimension.
    """
    if new_mgr_locs is None:
        first_axis = slicer[0] if isinstance(slicer, tuple) else slicer
        new_mgr_locs = self.mgr_locs[first_axis]

    sliced = self._slice(slicer)
    if sliced.ndim != self.ndim:
        raise ValueError("Only same dim slicing is allowed")
    return self.make_block_same_class(sliced, new_mgr_locs)
@property
def shape(self):
    """ shape of the stored values """
    data = self.values
    return data.shape
@property
def itemsize(self):
    """ itemsize of the stored values """
    data = self.values
    return data.itemsize
@property
def dtype(self):
    """ dtype of the stored values """
    data = self.values
    return data.dtype
@property
def ftype(self):
    """ "<dtype>:<_ftype>" string for this block """
    parts = (self.dtype, self._ftype)
    return "%s:%s" % parts
def merge(self, other):
    """ merge myself with ``other`` via _merge_blocks """
    pair = [self, other]
    return _merge_blocks(pair)
def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
                 limit=None, mask_info=None):
    """
    Reindex along ``axis`` using a pre-computed ``indexer``.

    ``fill_value`` falls back to the block's own fill value. ``method``
    and ``limit`` are accepted but not used here.

    Raises AssertionError when ``axis`` is below 1.
    """
    if axis < 1:
        raise AssertionError('axis must be at least 1, got %d' % axis)

    fill = self.fill_value if fill_value is None else fill_value
    taken = com.take_nd(self.values, indexer, axis,
                        fill_value=fill, mask_info=mask_info)
    return make_block(taken,
                      ndim=self.ndim, fastpath=True,
                      placement=self.mgr_locs)
def get(self, item):
    """
    Look up ``item`` in ``self.items`` and return my values at that
    location.

    NOTE(review): ``items`` is not listed in ``Block.__slots__`` and nothing
    in the visible code assigns it, so this presumably raises
    AttributeError unless a subclass provides it -- confirm.
    """
    loc = self.items.get_loc(item)
    return self.values[loc]
def iget(self, i):
    """ return my values at position ``i`` """
    data = self.values
    return data[i]
def set(self, locs, values, check=False):
    """
    Assign ``values`` into my values at ``locs``, in place.

    ``check`` is accepted but not used here.

    Returns
    -------
    None
    """
    target = self.values
    target[locs] = values
def delete(self, loc):
    """
    Drop ``loc`` (one or several positions) from the block in place:
    removed from the values along axis 0 and from the placement.
    """
    trimmed = np.delete(self.values, loc, 0)
    self.values = trimmed
    self.mgr_locs = self.mgr_locs.delete(loc)
def apply(self, func, **kwargs):
    """ call ``func`` on my values; wrap the outcome in a block unless it
    already is one

    NOTE: ``kwargs`` are accepted but are not forwarded to ``func``.
    """
    outcome = func(self.values)
    if isinstance(outcome, Block):
        return outcome
    return make_block(values=outcome, placement=self.mgr_locs)
def fillna(self, value, limit=None, inplace=False, downcast=None):
    """
    Fill missing entries with ``value`` and return a list of blocks.

    Blocks that cannot hold NA are returned unchanged (copied unless
    ``inplace``). ``limit`` caps the fills counted along the last axis and
    is not implemented for ndim > 2.
    """
    if not self._can_hold_na:
        return [self] if inplace else [self.copy()]

    na_mask = isnull(self.values)
    if limit is not None:
        if self.ndim > 2:
            raise NotImplementedError
        # un-flag everything past the first `limit` missing entries
        over_limit = na_mask.cumsum(self.ndim - 1) > limit
        na_mask[over_limit] = False

    fill = self._try_fill(value)
    filled = self.putmask(na_mask, fill, inplace=inplace)
    return self._maybe_downcast(filled, downcast)
- def _maybe_downcast(self, blocks, downcast=None):
- # no need to downcast our float
- # unless indicated
- if downcast is None and self.is_float:
- return blocks
- elif downcast is None and (self.is_timedelta or self.is_datetime):
- return blocks
- result_blocks = []
- for b in blocks:
- result_blocks.extend(b.downcast(downcast))
- return result_blocks
def downcast(self, dtypes=None):
    """
    Try to downcast my values; return a list of blocks.

    Parameters
    ----------
    dtypes : False, None, 'infer' or dict
        False turns downcasting off. For a single (1-d) block None means
        'infer'; for higher dimensions None returns the block unchanged.
        A dict is accepted by the validation but is not supported yet.

    Raises
    ------
    ValueError : ``dtypes`` is neither 'infer' nor a dict (ndim > 1)
    AssertionError : ``dtypes`` is a dict and there are items to process
    """
    # turn it off completely
    if dtypes is False:
        return [self]

    values = self.values

    # single block handling
    if self._is_single_block:
        # try to cast all non-floats here
        if dtypes is None:
            dtypes = 'infer'
        nv = _possibly_downcast_to_dtype(values, dtypes)
        return [make_block(nv, ndim=self.ndim,
                           fastpath=True, placement=self.mgr_locs)]

    # ndim > 1
    if dtypes is None:
        return [self]

    if not (dtypes == 'infer' or isinstance(dtypes, dict)):
        raise ValueError("downcast must have a dictionary or 'infer' as "
                         "its argument")

    # item-by-item
    # this is expensive as it splits the blocks items-by-item
    blocks = []
    for i, rl in enumerate(self.mgr_locs):
        if dtypes != 'infer':
            # per-item dtype lookup is not implemented; the old code kept
            # an unreachable lookup on an undefined name after this raise
            raise AssertionError("dtypes as dict is not supported yet")

        nv = _possibly_downcast_to_dtype(values[i], 'infer')
        nv = _block_shape(nv, ndim=self.ndim)
        blocks.append(make_block(nv,
                                 ndim=self.ndim, fastpath=True,
                                 placement=[rl]))
    return blocks
def astype(self, dtype, copy=False, raise_on_error=True, values=None):
    """ public entry point; forwards every argument to ``_astype`` """
    options = dict(copy=copy, raise_on_error=raise_on_error, values=values)
    return self._astype(dtype, **options)
def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
            klass=None):
    """
    Coerce to the new type (if copy=True, return a new copy)

    Parameters
    ----------
    dtype : anything ``np.dtype`` accepts
    copy : return a copy when no conversion is needed or when it failed
    raise_on_error : re-raise a failed conversion when True; otherwise
        fall back to the unconverted block
    values : pre-converted values to wrap instead of converting my own
    klass : block class forwarded to ``make_block``

    Raises
    ------
    TypeError : both blocks are numeric but the shapes differ
    """
    dtype = np.dtype(dtype)
    if self.dtype == dtype:
        if copy:
            return self.copy()
        return self

    try:
        # force the copy here
        if values is None:
            # _astype_nansafe works fine with 1-d only
            values = com._astype_nansafe(self.values.ravel(), dtype, copy=True)
            values = values.reshape(self.values.shape)
        newb = make_block(values,
                          ndim=self.ndim, placement=self.mgr_locs,
                          fastpath=True, dtype=dtype, klass=klass)
    except Exception:
        # only ordinary errors are handled; KeyboardInterrupt / SystemExit
        # must never be swallowed by the fallback below
        if raise_on_error is True:
            raise
        newb = self.copy() if copy else self

    if newb.is_numeric and self.is_numeric:
        if newb.shape != self.shape:
            raise TypeError("cannot set astype for copy = [%s] for dtype "
                            "(%s [%s]) with smaller itemsize that current "
                            "(%s [%s])" % (copy, self.dtype.name,
                                           self.itemsize, newb.dtype.name,
                                           newb.itemsize))
    return newb
def convert(self, copy=True, **kwargs):
    """ nothing to coerce here (by definition this is not an ObjectBlock);
    return a one-element list holding a copy of the block when
    ``copy`` is True, else the block itself """
    if copy:
        return [self.copy()]
    return [self]
- def _can_hold_element(self, value):
- raise NotImplementedError()
- def _try_cast(self, value):
- raise NotImplementedError()
def _try_cast_result(self, result, dtype=None):
    """ cast ``result`` back towards ``dtype`` (my own dtype by default);
    the values may have round-tripped through object in the meantime """
    target = self.dtype if dtype is None else dtype

    exempt = self.is_integer or self.is_bool or self.is_datetime
    if exempt or not (self.is_float and result.dtype == self.dtype):
        # may need to change the dtype here
        return _possibly_downcast_to_dtype(result, target)

    # float block whose result kept my dtype:
    # protect against a bool/object showing up here
    if isinstance(target, compat.string_types) and target == 'infer':
        return result
    if not isinstance(target, type):
        target = target.type

    if issubclass(target, np.bool_):
        if isnull(result).all():
            return result.astype(np.bool_)
        as_object = result.astype(np.object_)
        as_object[as_object == 1] = True
        as_object[as_object == 0] = False
        return as_object
    if issubclass(target, np.object_):
        return result.astype(np.object_)
    return result
- def _try_operate(self, values):
- """ return a version to operate on as the input """
- return values
- def _try_coerce_args(self, values, other):
- """ provide coercion to our input arguments """
- return values, other
- def _try_coerce_result(self, result):
- """ reverse of try_coerce_args """
- return result
- def _try_coerce_and_cast_result(self, result, dtype=None):
- result = self._try_coerce_result(result)
- result = self._try_cast_result(result, dtype=dtype)
- return result
- def _try_fill(self, value):
- return value
def to_native_types(self, slicer=None, na_rep='', **kwargs):
    """ return my values as nested lists of python objects, with missing
    entries replaced by ``na_rep``; ``slicer`` selects along the second
    axis first """
    data = self.values if slicer is None else self.values[:, slicer]
    as_object = np.array(data, dtype=object)
    as_object[isnull(as_object)] = na_rep
    return as_object.tolist()
- # block actions ####
def copy(self, deep=True):
    """ new block of my class and placement; the values are copied only
    when ``deep`` is True """
    payload = self.values.copy() if deep else self.values
    return make_block(payload, ndim=self.ndim,
                      klass=self.__class__, fastpath=True,
                      placement=self.mgr_locs)
def replace(self, to_replace, value, inplace=False, filter=None,
            regex=False):
    """ replace occurrences of ``to_replace`` with ``value`` through
    putmask, which may create new blocks. ``filter`` limits the
    replacement to the given placement locations. ``regex`` is unused
    here; it is part of the signature for API compatibility with
    ObjectBlock. """
    hits = com.mask_missing(self.values, to_replace)
    if filter is not None:
        excluded = ~self.mgr_locs.isin(filter)
        hits[excluded.nonzero()[0]] = False

    if hits.any():
        return self.putmask(hits, value, inplace=inplace)
    # nothing matched
    return [self] if inplace else [self.copy()]
def setitem(self, indexer, value):
    """ assign ``value`` at ``indexer`` and return a one-element list with
    the resulting block (its dtype may differ from mine)

    indexer is a direct slice/positional indexer; value must be a
    compatible shape

    Raises ValueError when a list-like or slice indexer and a list-like
    value have different lengths. ValueError/TypeError raised while
    assigning propagate; any other exception there is swallowed and the
    block itself is returned.
    """
    # coerce args
    values, value = self._try_coerce_args(self.values, value)
    arr_value = np.array(value)

    # cast the values to a type that can hold nan (if necessary)
    if not self._can_hold_element(value):
        promoted, _ = com._maybe_promote(arr_value.dtype)
        values = values.astype(promoted)

    if self.ndim == 2:
        transf = lambda x: x.T
    else:
        transf = lambda x: x
    values = transf(values)
    n_values = len(values)

    # length checking
    if isinstance(indexer, (np.ndarray, list)):
        if is_list_like(value) and len(indexer) != len(value):
            # a boolean mask whose True count matches the value is ok too
            mask_matches = (isinstance(indexer, np.ndarray) and
                            indexer.dtype == np.bool_ and
                            len(indexer[indexer]) == len(value))
            if not mask_matches:
                raise ValueError("cannot set using a list-like indexer "
                                 "with a different length than the value")
    elif isinstance(indexer, slice):
        if is_list_like(value) and n_values:
            if len(value) != _length_of_indexer(indexer, values):
                raise ValueError("cannot set using a slice indexer with a "
                                 "different length than the value")

    try:
        if arr_value.ndim == 1 and (
                np.isscalar(indexer) or
                (isinstance(indexer, tuple) and
                 all([np.isscalar(idx) for idx in indexer]))):
            # a single element per dim, rhs may be e.g. a list (GH 6043)
            values[indexer] = value
        elif (len(arr_value.shape) and
              arr_value.shape[0] == values.shape[0] and
              np.prod(arr_value.shape) == np.prod(values.shape)):
            # exact match (ex-broadcasting): take on the value's dtype
            values[indexer] = value
            values = values.astype(arr_value.dtype)
        else:
            values[indexer] = value

        # coerce and try to infer the dtypes of the result
        if np.isscalar(value):
            dtype, _ = _infer_dtype_from_scalar(value)
        else:
            dtype = 'infer'
        values = self._try_coerce_and_cast_result(values, dtype)
        return [make_block(transf(values),
                           ndim=self.ndim, placement=self.mgr_locs,
                           fastpath=True)]
    except (ValueError, TypeError):
        raise
    except Exception:
        pass

    return [self]
def putmask(self, mask, new, align=True, inplace=False):
    """ put ``new`` into my values where ``mask`` is True; a block of a
    different dtype may be created when I cannot hold ``new``

    Parameters
    ----------
    mask : the condition to respect
    new : a ndarray/object
    align : boolean, default True (not consulted in this implementation)
    inplace : perform inplace modification, default is False

    Returns
    -------
    a list of block(s), the result of the putmask
    """
    new_values = self.values if inplace else self.values.copy()

    # unwrap container-like inputs
    if hasattr(new, 'reindex_axis'):
        new = new.values.T
    if hasattr(mask, 'reindex_axis'):
        mask = mask.values.T

    # a scalar null is replaced by my own fill value
    if not is_list_like(new) and isnull(new):
        new = self.fill_value

    if self._can_hold_element(new):
        new = self._try_cast(new)
        # pseudo-broadcast
        if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1:
            new = np.repeat(new, self.shape[-1]).reshape(self.shape)
        np.putmask(new_values, mask, new)

    elif mask.any():
        # maybe upcast me
        if self.ndim <= 1:
            upcast = _putmask_smart(new_values, mask, new)
            return [make_block(values=upcast,
                               placement=self.mgr_locs,
                               fastpath=True)]

        # need to go column by column
        pieces = []
        for i, ref_loc in enumerate(self.mgr_locs):
            item_mask = mask[i]
            item_values = new_values[i]
            if item_mask.any():
                # need a new block
                repl = new[i] if isinstance(
                    new, np.ndarray) else np.array(new)
                # type of the new block
                promoted, _ = com._maybe_promote(repl.dtype)
                # astype explicitly so that a copy is made
                repl = repl.astype(promoted)
                item_result = _putmask_smart(item_values, item_mask, repl)
            else:
                item_result = item_values if inplace else item_values.copy()

            # put back the dimension that indexing removed
            pieces.append(make_block(values=item_result[np.newaxis],
                                     placement=[ref_loc],
                                     fastpath=True))
        return pieces

    if inplace:
        return [self]
    return [make_block(new_values,
                       placement=self.mgr_locs, fastpath=True)]
def interpolate(self, method='pad', axis=0, index=None,
                values=None, inplace=False, limit=None,
                fill_value=None, coerce=False, downcast=None, **kwargs):
    """
    Fill or interpolate according to ``method``.

    ``method`` is first tried as a fillna-style method (handled by
    ``_interpolate_with_fill``) and then as an interpolation method
    (handled by ``_interpolate``). Bool and integer (non-timedelta) blocks
    are returned as-is (copied unless ``inplace``).

    Raises ValueError when ``method`` is recognised by neither cleaner.
    """

    def check_int_bool(self, inplace):
        # Only FloatBlocks will contain NaNs.
        # timedelta subclasses IntBlock
        if (self.is_bool or self.is_integer) and not self.is_timedelta:
            if inplace:
                return self
            else:
                return self.copy()

    # a fill na type method
    # (catch Exception only, so KeyboardInterrupt/SystemExit are not
    # turned into an "invalid method" error)
    try:
        m = com._clean_fill_method(method)
    except Exception:
        m = None

    if m is not None:
        r = check_int_bool(self, inplace)
        if r is not None:
            return r
        return self._interpolate_with_fill(method=m,
                                           axis=axis,
                                           inplace=inplace,
                                           limit=limit,
                                           fill_value=fill_value,
                                           coerce=coerce,
                                           downcast=downcast)

    # try an interp method
    try:
        m = com._clean_interp_method(method, **kwargs)
    except Exception:
        m = None

    if m is not None:
        r = check_int_bool(self, inplace)
        if r is not None:
            return r
        return self._interpolate(method=m,
                                 index=index,
                                 values=values,
                                 axis=axis,
                                 limit=limit,
                                 fill_value=fill_value,
                                 inplace=inplace,
                                 downcast=downcast,
                                 **kwargs)

    raise ValueError("invalid method '{0}' to interpolate.".format(method))
def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
                           limit=None, fill_value=None, coerce=False,
                           downcast=None):
    """ fillna implemented on top of ``com.interpolate_2d``; returns a
    list of blocks """
    # when coercing, don't force the conversion on a block that
    # cannot hold NA
    if coerce and not self._can_hold_na:
        return [self] if inplace else [self.copy()]

    fill_value = self._try_fill(fill_value)
    data = self.values if inplace else self.values.copy()
    data = self._try_operate(data)
    data = com.interpolate_2d(data,
                              method=method,
                              axis=axis,
                              limit=limit,
                              fill_value=fill_value,
                              dtype=self.dtype)
    data = self._try_coerce_result(data)

    filled = make_block(data,
                        ndim=self.ndim, klass=self.__class__,
                        fastpath=True, placement=self.mgr_locs)
    return self._maybe_downcast([filled], downcast)
def _interpolate(self, method=None, index=None, values=None,
                 fill_value=None, axis=0, limit=None,
                 inplace=False, downcast=None, **kwargs):
    """ interpolate each 1-d slice along ``axis`` with
    ``com.interpolate_1d``; integer data is converted to float64 first and
    any other non-float block is returned untouched """
    data = self.values if inplace else self.values.copy()

    # only deal with floats
    if not self.is_float:
        if not self.is_integer:
            return self
        data = data.astype(np.float64)

    if fill_value is None:
        fill_value = self.fill_value

    needs_monotonic = ('krogh', 'piecewise_polynomial', 'pchip')
    if method in needs_monotonic and not index.is_monotonic:
        raise ValueError("{0} interpolation requires that the "
                         "index be monotonic.".format(method))

    def interp_slice(x):
        # handle a single 1-d slice and hand it back
        return com.interpolate_1d(index, x, method=method, limit=limit,
                                  fill_value=fill_value,
                                  bounds_error=False, **kwargs)

    # each slice along `axis` is processed independently
    interpolated = np.apply_along_axis(interp_slice, axis, data)

    result = make_block(interpolated,
                        ndim=self.ndim, klass=self.__class__,
                        fastpath=True, placement=self.mgr_locs)
    return self._maybe_downcast([result], downcast)
def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
    """
    Take values according to indexer and return them as a block.

    Filling is enabled only when ``fill_tuple`` is given; its first
    element is the fill value. When ``new_mgr_locs`` is omitted the
    placement is re-derived from ``indexer`` for axis 0 and kept as-is
    for any other axis. The result is of my own class unless the take
    changed the dtype.
    """
    source = self.get_values()
    if fill_tuple is None:
        taken = com.take_nd(source, indexer, axis=axis,
                            allow_fill=False)
    else:
        taken = com.take_nd(source, indexer, axis=axis,
                            allow_fill=True, fill_value=fill_tuple[0])

    if new_mgr_locs is None:
        if axis != 0:
            new_mgr_locs = self.mgr_locs
        else:
            # prefer a slice over a fancy indexer when one exists
            as_slice = lib.indexer_as_slice(indexer)
            key = indexer if as_slice is None else as_slice
            new_mgr_locs = self.mgr_locs[key]

    if taken.dtype == self.dtype:
        return self.make_block_same_class(taken, new_mgr_locs)
    return make_block(taken, new_mgr_locs)
def get_values(self, dtype=None):
    """ return the stored values; ``dtype`` is ignored here """
    data = self.values
    return data
def diff(self, n):
    """ one-element list with the block of ``com.diff`` of my values
    along axis 1 """
    differenced = com.diff(self.values, n, axis=1)
    result = make_block(values=differenced,
                        ndim=self.ndim, fastpath=True,
                        placement=self.mgr_locs)
    return [result]
def shift(self, periods, axis=0):
    """ shift the block by periods, possibly upcast

    The values are first passed through ``com._maybe_upcast`` to obtain
    values and a fill value, then rolled by ``periods`` along ``axis``;
    the positions that wrapped around are overwritten with the fill
    value. A shift of 0 leaves every value in place.

    Returns a one-element list with the shifted block.
    """
    # convert integer to float if necessary. need to do a lot more than
    # that, handle boolean etc also
    new_values, fill_value = com._maybe_upcast(self.values)

    # make sure array sent to np.roll is c_contiguous
    f_ordered = new_values.flags.f_contiguous
    if f_ordered:
        new_values = new_values.T
        axis = new_values.ndim - axis - 1
    new_values = np.roll(new_values, periods, axis=axis)

    # blank out the entries that wrapped around; with periods == 0
    # nothing wrapped (slice(0, None) would wipe the whole block)
    if periods != 0:
        axis_indexer = [slice(None)] * self.ndim
        if periods > 0:
            axis_indexer[axis] = slice(None, periods)
        else:
            axis_indexer[axis] = slice(periods, None)
        new_values[tuple(axis_indexer)] = fill_value

    # restore original order
    if f_ordered:
        new_values = new_values.T

    return [make_block(new_values,
                       ndim=self.ndim, fastpath=True,
                       placement=self.mgr_locs)]
def eval(self, func, other, raise_on_error=True, try_cast=False):
    """
    evaluate the block; return result block from the result

    Parameters
    ----------
    func : how to combine self, other
    other : a ndarray/object
    raise_on_error : if True (the default), raise TypeError when the
        function cannot be applied; if False, an object array of NaN with
        the shape of the values is used as the result instead
    try_cast : try to cast the result back to my dtype

    Returns
    -------
    a one-element list with the new block, the result of the func

    Raises
    ------
    ValueError : shapes cannot be broadcast, or ``func`` raised ValueError
    TypeError : the operation failed (see ``raise_on_error``) or did not
        produce an ndarray
    """
    values = self.values

    if hasattr(other, 'reindex_axis'):
        other = other.values

    # make sure that we can broadcast
    is_transposed = False
    if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
        if values.ndim != other.ndim:
            is_transposed = True
        else:
            if values.shape == other.shape[::-1]:
                is_transposed = True
            elif values.shape[0] == other.shape[-1]:
                is_transposed = True
            else:
                # this is a broadcast error
                raise ValueError("cannot broadcast shape [%s] with block "
                                 "values [%s]" % (values.T.shape,
                                                  other.shape))

    transf = (lambda x: x.T) if is_transposed else (lambda x: x)

    # coerce/transpose the args if needed
    values, other = self._try_coerce_args(transf(values), other)

    # get the result, may need to transpose the other
    def get_result(other):
        return self._try_coerce_result(func(values, other))

    # error handler if we have an issue operating with the function;
    # the exception is passed in explicitly rather than read from the
    # enclosing scope, where it only exists inside the except clause
    def handle_error(detail):
        if raise_on_error:
            raise TypeError('Could not operate %s with block values %s'
                            % (repr(other), str(detail)))
        else:
            # return the values
            result = np.empty(values.shape, dtype='O')
            result.fill(np.nan)
            return result

    # get the result
    try:
        result = get_result(other)
    # if we have an invalid shape/broadcast error
    # GH4576, so raise instead of allowing to pass through
    except ValueError:
        raise
    except Exception as detail:
        result = handle_error(detail)

    # technically a broadcast error in numpy can 'work' by returning a
    # boolean False
    if not isinstance(result, np.ndarray):
        # differentiate between an invalid ndarray-ndarray comparison
        # and an invalid type comparison
        if isinstance(values, np.ndarray) and is_list_like(other):
            raise ValueError('Invalid broadcasting comparison [%s] '
                             'with block values' % repr(other))
        raise TypeError('Could not compare [%s] with block values'
                        % repr(other))

    # transpose if needed
    result = transf(result)

    # try to cast if requested
    if try_cast:
        result = self._try_cast_result(result)

    return [make_block(result, ndim=self.ndim,
                       fastpath=True, placement=self.mgr_locs)]
def where(self, other, cond, align=True, raise_on_error=True,
          try_cast=False):
    """
    Keep my values where ``cond`` holds and take ``other`` elsewhere.

    Parameters
    ----------
    other : a ndarray/object
    cond : the condition to respect
    align : boolean (not consulted in this implementation)
    raise_on_error : if True (the default), raise TypeError when the
        operation fails; if False, a float64 array of NaN is used instead
    try_cast : try to cast the result back to my dtype

    Returns
    -------
    a single block when I can hold NA or am 1-dimensional, otherwise a
    list of blocks
    """
    values = self.values

    # see if we can align other
    if hasattr(other, 'reindex_axis'):
        other = other.values

    # make sure that we can broadcast
    is_transposed = False
    if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
        if values.ndim != other.ndim or values.shape == other.shape[::-1]:
            if (values.shape[0] == np.array(values.shape)).all():
                # symmetric shapes are fine as they are (GH 7506)
                pass
            elif (other.ndim >= 1 and values.ndim - 1 == other.ndim and
                  values.shape[0] != other.shape[0]):
                # pseudo broadcast (e.g. 2d vs 1d, and where needs it in a
                # specific direction)
                other = _block_shape(other).T
            else:
                values = values.T
                is_transposed = True

    # see if we can align cond
    if not hasattr(cond, 'shape'):
        raise ValueError(
            "where must have a condition that is ndarray like")
    if hasattr(cond, 'reindex_axis'):
        cond = cond.values

    # may need to undo transpose of values
    if hasattr(values, 'ndim') and (
            values.ndim != cond.ndim or values.shape == cond.shape[::-1]):
        values = values.T
        is_transposed = not is_transposed

    def func(c, v, o):
        # the actual where operation
        if c.ravel().all():
            return v

        v, o = self._try_coerce_args(v, o)
        try:
            picked = expressions.where(c, v, o, raise_on_error=True)
            return self._try_coerce_result(picked)
        except Exception as detail:
            if raise_on_error:
                raise TypeError('Could not operate [%s] with block values '
                                '[%s]' % (repr(o), str(detail)))
            # fall back to an all-NaN result
            fallback = np.empty(v.shape, dtype='float64')
            fallback.fill(np.nan)
            return fallback

    result = func(cond, values, other)

    # whole-block result when we can hold NA or are a single block
    # (ndim == 1); otherwise go item-by-item below
    if self._can_hold_na or self.ndim == 1:
        if not isinstance(result, np.ndarray):
            raise TypeError('Could not compare [%s] with block values'
                            % repr(other))

        if is_transposed:
            result = result.T

        # try to cast if requested
        if try_cast:
            result = self._try_cast_result(result)

        return make_block(result,
                          ndim=self.ndim, placement=self.mgr_locs)

    # might need to separate out blocks
    axis = cond.ndim - 1
    cond = cond.swapaxes(axis, 0)
    fully_true = np.array([cond[i].all() for i in range(cond.shape[0])],
                          dtype=bool)

    result_blocks = []
    for selector in (fully_true, ~fully_true):
        if not selector.any():
            continue
        piece = self._try_cast_result(
            result.take(selector.nonzero()[0], axis=axis))
        result_blocks.append(make_block(piece.T,
                                        placement=self.mgr_locs[selector]))
    return result_blocks
def equals(self, other):
    """ True when dtype, shape and all values match those of ``other`` """
    same_dtype = self.dtype == other.dtype
    if not same_dtype or self.shape != other.shape:
        return False
    return np.array_equal(self.values, other.values)
class NumericBlock(Block):
    """ base for the numeric blocks: flagged numeric and able to hold NA
    unless a subclass says otherwise """
    __slots__ = ()

    is_numeric = True
    _can_hold_na = True
class FloatOrComplexBlock(NumericBlock):
    """ shared behaviour of the float and complex blocks """
    __slots__ = ()

    def equals(self, other):
        """ like Block.equals, but NaNs in matching positions compare
        equal """
        mismatch = self.dtype != other.dtype or self.shape != other.shape
        if mismatch:
            return False
        left, right = self.values, other.values
        both_nan = np.isnan(left) & np.isnan(right)
        return ((left == right) | both_nan).all()
class FloatBlock(FloatOrComplexBlock):
    """ block holding floating point values """
    __slots__ = ()

    is_float = True
    _downcast_dtype = 'int64'

    def _can_hold_element(self, element):
        """ floats and integers fit; bools and datetime/timedelta values
        do not """
        if not is_list_like(element):
            return isinstance(element, (float, int, np.float_, np.int_)) and not isinstance(
                element, (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64))

        element = np.array(element)
        tipo = element.dtype.type
        return issubclass(tipo, (np.floating, np.integer)) and not issubclass(
            tipo, (np.datetime64, np.timedelta64))

    def _try_cast(self, element):
        """ float(element), or element itself when that fails """
        try:
            return float(element)
        except:  # pragma: no cover
            return element

    def to_native_types(self, slicer=None, na_rep='', float_format=None,
                        **kwargs):
        """ nested lists of python objects; missing entries become
        ``na_rep`` and, when ``float_format`` is given, the remaining ones
        are formatted with it """
        data = self.values if slicer is None else self.values[:, slicer]
        out = np.array(data, dtype=object)
        missing = isnull(out)
        out[missing] = na_rep
        if float_format:
            present = (~missing).ravel()
            formatted = [float_format % val for val in out.ravel()[present]]
            out.flat[present] = np.array(formatted)
        return out.tolist()

    def should_store(self, value):
        """ only floating values of exactly my dtype are stored here, so
        that inserting a column does not coerce integers to floats
        unnecessarily """
        is_floating = issubclass(value.dtype.type, np.floating)
        return (is_floating and
                value.dtype == self.dtype)
class ComplexBlock(FloatOrComplexBlock):
    """ block holding complex values """
    __slots__ = ()
    is_complex = True

    def _can_hold_element(self, element):
        """ floats, integers and complex numbers fit; booleans do not

        For list-likes the decision is made on the dtype of
        ``np.array(element)``.
        """
        if is_list_like(element):
            element = np.array(element)
            return issubclass(element.dtype.type, (np.floating, np.integer, np.complexfloating))
        # bool is a subclass of int, so it has to be excluded explicitly;
        # the check must look at the element, not at the ``bool`` type
        return (isinstance(element, (float, int, complex, np.float_, np.int_)) and
                not isinstance(element, (bool, np.bool_)))

    def _try_cast(self, element):
        """ complex(element), or element itself when that fails """
        try:
            return complex(element)
        except:  # pragma: no cover
            return element

    def should_store(self, value):
        """ True for values with a complex floating dtype """
        return issubclass(value.dtype.type, np.complexfloating)
class IntBlock(NumericBlock):
    """ block holding integer values; cannot hold NA """
    __slots__ = ()

    is_integer = True
    _can_hold_na = False

    def _can_hold_element(self, element):
        """ integers fit; datetime/timedelta dtypes do not """
        if not is_list_like(element):
            return com.is_integer(element)
        element = np.array(element)
        tipo = element.dtype.type
        return issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))

    def _try_cast(self, element):
        """ int(element), or element itself when that fails """
        try:
            return int(element)
        except:  # pragma: no cover
            return element

    def should_store(self, value):
        """ integer values of exactly my dtype """
        is_int = com.is_integer_dtype(value)
        return is_int and value.dtype == self.dtype
class TimeDeltaBlock(IntBlock):
    """ block holding timedelta values; missing entries are iNaT """
    __slots__ = ()

    is_timedelta = True
    _can_hold_na = True
    is_numeric = False

    @property
    def fill_value(self):
        """ iNaT marks a missing timedelta """
        return tslib.iNaT

    def _try_fill(self, value):
        """ translate the requested fill value: NaT/null becomes iNaT,
        integers and datetime.timedelta become np.timedelta64, anything
        else passes through """
        if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all():
            return tslib.iNaT
        if isinstance(value, np.timedelta64):
            return value
        if com.is_integer(value):
            # coerce to seconds of timedelta
            return np.timedelta64(int(value * 1e9))
        if isinstance(value, timedelta):
            return np.timedelta64(value)
        return value

    def _try_coerce_args(self, values, other):
        """ coerce both operands for comparison: the i8 view is converted
        to float64 with NaT represented as np.nan so that nans propagate.
        values is always ndarray like, other may not be """

        def as_float_with_nan(arr):
            nulls = isnull(arr)
            arr = arr.view('i8').astype('float64')
            arr[nulls] = np.nan
            return arr

        values = as_float_with_nan(values)

        if _is_null_datelike_scalar(other):
            other = np.nan
        elif isinstance(other, np.timedelta64):
            other = _coerce_scalar_to_timedelta_type(other, unit='s').item()
            if other == tslib.iNaT:
                other = np.nan
        else:
            other = as_float_with_nan(other)

        return values, other

    def _try_operate(self, values):
        """ operate on the i8 view of the values """
        return values.view('i8')

    def _try_coerce_result(self, result):
        """ reverse of try_coerce_args / try_operate """
        if isinstance(result, np.integer):
            return np.timedelta64(result)
        if isinstance(result, np.ndarray):
            nulls = isnull(result)
            if result.dtype.kind in ['i', 'f', 'O']:
                result = result.astype('m8[ns]')
            result[nulls] = tslib.iNaT
        return result

    def should_store(self, value):
        """ True for values with a timedelta64 dtype """
        return issubclass(value.dtype.type, np.timedelta64)

    def to_native_types(self, slicer=None, na_rep=None, **kwargs):
        """ nested lists with each timedelta rendered through
        ``lib.repr_timedelta64``; missing entries become ``na_rep``
        ('NaT' by default) """
        data = self.values if slicer is None else self.values[:, slicer]
        missing = isnull(data)

        out = np.empty(data.shape, dtype=object)
        out[missing] = 'NaT' if na_rep is None else na_rep

        present = (~missing).ravel()
        rendered = [lib.repr_timedelta64(val)
                    for val in data.ravel()[present]]
        out.flat[present] = np.array(rendered, dtype=object)
        return out.tolist()
class BoolBlock(NumericBlock):
    """ Block specialisation for boolean values; it cannot hold NA """
    __slots__ = ()
    is_bool = True
    _can_hold_na = False

    def _can_hold_element(self, element):
        """ can we store element without changing dtype

        list-likes qualify when their numpy dtype is an integer type,
        scalars when they are a python int or bool
        """
        if is_list_like(element):
            element = np.array(element)
            # NOTE(review): np.bool_ is not an np.integer subclass, so a
            # bool-dtype array answers False here -- confirm this is intended
            return issubclass(element.dtype.type, np.integer)
        return isinstance(element, (int, bool))

    def _try_cast(self, element):
        """ coerce element to bool, handing it back unchanged on failure """
        try:
            return bool(element)
        except Exception:  # pragma: no cover
            # best-effort cast; only ordinary errors are swallowed so that
            # KeyboardInterrupt / SystemExit still propagate
            return element

    def should_store(self, value):
        """ True when value's dtype type is a ``np.bool_`` subclass """
        return issubclass(value.dtype.type, np.bool_)

    def replace(self, to_replace, value, inplace=False, filter=None,
                regex=False):
        """ replace to_replace with value

        when to_replace cannot be cast to bool (per ``np.can_cast``) there
        is nothing to do and this block itself is returned; otherwise the
        parent class implementation handles the replacement
        """
        to_replace_values = np.atleast_1d(to_replace)
        if not np.can_cast(to_replace_values, bool):
            return self
        return super(BoolBlock, self).replace(to_replace, value,
                                              inplace=inplace, filter=filter,
                                              regex=regex)
- class ObjectBlock(Block):
- __slots__ = ()
- is_object = True
- _can_hold_na = True
def __init__(self, values, ndim=2, fastpath=False,
             placement=None):
    """ build the block, storing string-typed input as object dtype """
    holds_strings = issubclass(values.dtype.type, compat.string_types)
    if holds_strings:
        # re-create the array with object dtype before handing it on
        values = np.array(values, dtype=object)

    super(ObjectBlock, self).__init__(values,
                                      placement=placement,
                                      ndim=ndim,
                                      fastpath=fastpath)
@property
def is_bool(self):
    """ an object block counts as bool when ``lib.is_bool_array`` accepts
    its flattened values
    """
    flattened = self.values.ravel()
    return lib.is_bool_array(flattened)
def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=True,
            copy=True, by_item=True):
    """ attempt to coerce any object types to better types

    by definition we ARE an ObjectBlock!!!!!

    Parameters
    ----------
    convert_dates, convert_numeric, convert_timedeltas :
        forwarded unchanged to ``com._possibly_convert_objects``
    copy : accepted for signature compatibility; not consulted here
    by_item : boolean, default True
        when True (and we are not a single block) every item is converted
        on its own and gets its own block

    Returns
    -------
    list of blocks (can return multiple blocks!)
    """
    # the same flags must reach the converter on both paths; previously the
    # whole-block path silently dropped convert_timedeltas
    flags = dict(convert_dates=convert_dates,
                 convert_numeric=convert_numeric,
                 convert_timedeltas=convert_timedeltas)

    # attempt to create new type blocks
    blocks = []
    if by_item and not self._is_single_block:
        for i, rl in enumerate(self.mgr_locs):
            values = self.iget(i)
            values = com._possibly_convert_objects(
                values.ravel(), **flags).reshape(values.shape)
            values = _block_shape(values, ndim=self.ndim)
            blocks.append(make_block(values,
                                     ndim=self.ndim, placement=[rl]))
    else:
        values = com._possibly_convert_objects(
            self.values.ravel(), **flags).reshape(self.values.shape)
        blocks.append(make_block(values,
                                 ndim=self.ndim, placement=self.mgr_locs))

    return blocks
def set(self, locs, values, check=False):
    """
    Modify Block in-place with new item value

    Parameters
    ----------
    locs : indexer applied to ``self.values``
    values : the new value(s) to write at locs
    check : boolean, default False
        when True, skip the write if what is stored at locs already
        compares equal to values

    Returns
    -------
    None
    """

    # GH6026
    if check:
        try:
            if (self.values[locs] == values).all():
                return
        except Exception:
            # the equality probe is best-effort (e.g. the comparison may
            # yield a plain bool without .all); fall through and assign
            pass

    try:
        self.values[locs] = values
    except ValueError:
        # broadcasting error
        # see GH6171
        # NOTE(review): ``items`` is not among the Block __slots__ declared
        # at the top of this file -- confirm it is still provided
        new_shape = list(values.shape)
        new_shape[0] = len(self.items)
        self.values = np.empty(tuple(new_shape), dtype=self.dtype)
        self.values.fill(np.nan)
        self.values[locs] = values
- def _maybe_downcast(self, blocks, downcast=None):
- if downcast is not None:
- return blocks
- # split and convert the blocks
- result_blocks = []
- for blk in blocks:
- result_blocks.extend(blk.convert(convert_dates=True,
- convert_numeric=False))
- return result_blocks
- def _can_hold_element(self, element):
- return True
- def _try_cast(self, element):
- return element
def should_store(self, value):
    """ True unless value's dtype type is an integer, floating, complex,
    datetime64 or bool numpy type
    """
    excluded_types = (np.integer, np.floating, np.complexfloating,
                      np.datetime64, np.bool_)
    has_excluded_dtype = issubclass(value.dtype.type, excluded_types)
    return not has_excluded_dtype
def replace(self, to_replace, value, inplace=False, filter=None,
            regex=False):
    """ replace to_replace with value, dispatching on the argument shapes

    - scalar compiled-regex to_replace: one regex replacement
    - scalars with regex falsey: the parent class implementation
    - both list-like: replacements applied pairwise, one after another
    - list-like to_replace with regex: every entry replaced by value
    - anything else: one ``_replace_single`` call

    Returns whatever the parent implementation returns on the second path,
    otherwise a one-element list holding the resulting block
    """
    to_rep_is_list = com.is_list_like(to_replace)
    value_is_list = com.is_list_like(value)
    no_lists = not (to_rep_is_list or value_is_list)

    def replace_one(block, pattern, replacement, use_regex):
        # _replace_single must hand back exactly one block
        (replaced,) = block._replace_single(pattern, replacement,
                                            inplace=inplace, filter=filter,
                                            regex=use_regex)
        return replaced

    if no_lists and com.is_re(to_replace):
        return [replace_one(self, to_replace, value, True)]

    if no_lists and not regex:
        return super(ObjectBlock, self).replace(to_replace, value,
                                                inplace=inplace,
                                                filter=filter, regex=regex)

    if to_rep_is_list and value_is_list:
        current = self
        for pattern, replacement in zip(to_replace, value):
            current = replace_one(current, pattern, replacement, regex)
        return [current]

    if to_rep_is_list and regex:
        current = self
        for pattern in to_replace:
            current = replace_one(current, pattern, value, regex)
        return [current]

    return [replace_one(self, to_replace, value, regex)]
- def _replace_single(self, to_replace, value, inplace=False, filter=None,
- regex=False):
- # to_replace is regex compilable
- to_rep_re = regex and com.is_re_compilable(to_replace)
- # regex is regex compilable
- regex_re = com.is_re_compilable(regex)
- # only one will survive
- if to_rep_re and regex_re:
- raise AssertionError('only one of to_replace and regex can be '
- 'regex compilable')
- # if regex was passed as something that can be a regex (rather than a
- # boolean)
- if regex_re:
- to_replace = regex
- regex = regex_re or to_rep_re
- # try to get the pattern attribute (compiled re) or it's a string
- try:
- pattern = to_replace.pattern
- except AttributeError:
- pattern = to_replace
- # if the pattern is not empty and to_replace is either a string or a
- # regex
- if regex and pattern:
- rx = re.compile(to_replace)
- else:
- # if the thing to replace is not a string or compiled regex call
- # the superclass method -> to_replace is some kind of object
- result = super(ObjectBlock, self).replace(to_replace, value,
- inplace=inplace,
- filter=filter,
- regex=regex)
- if not isinstance(result, list):
- result = [result]
- return result
- new_values = self.values if inplace else self.values.copy()
- # deal with replacing values with objects (strings) that match but
- # whose replacement is not a string (numeric, nan, object)
- if isnull(value) or not isinstance(value, compat.string_types):
- def re_replacer(s):
- try:
- return value if rx.search(s) is not None else s
- except TypeError:
- return s
- else:
- # value is guaranteed to …
Large files are truncated, but you can click here to view the full file