array.py - This is a Python implementation of sparse data s…

/pandas/sparse/array.py

http://github.com/wesm/pandas · Python · 753 lines · 677 code · 28 blank · 48 comment · 35 complexity · 51062ea59eed372167f748aa5f342da6 MD5 · raw file

"""
SparseArray data structure
"""
from __future__ import division
# pylint: disable=E1101,E1103,W0231

from numpy import nan, ndarray
import numpy as np

import pandas as pd
from pandas.core.base import PandasObject

from pandas import compat, lib
from pandas.compat import range
from pandas.compat.numpy import function as nv

from pandas.types.generic import ABCSparseArray, ABCSparseSeries
from pandas.types.common import (is_float, is_integer,
                                 is_integer_dtype, _ensure_platform_int,
                                 is_bool_dtype,
                                 is_list_like,
                                 is_scalar, is_dtype_equal)
from pandas.types.cast import (_possibly_convert_platform, _maybe_promote,
                               _astype_nansafe)
from pandas.types.missing import isnull, notnull

from pandas._sparse import SparseIndex, BlockIndex, IntIndex
import pandas._sparse as splib
import pandas.index as _index
import pandas.core.algorithms as algos
import pandas.core.ops as ops
import pandas.formats.printing as printing
from pandas.util.decorators import Appender
from pandas.indexes.base import _index_shared_docs


_sparray_doc_kwargs = dict(klass='SparseArray')


def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None,
                  **eval_kwargs):
    """
    Wrapper function for Series arithmetic operations, to avoid
    code duplication.
    """

    def wrapper(self, other):
        if isinstance(other, np.ndarray):
            if len(self) != len(other):
                raise AssertionError("length mismatch: %d vs. %d" %
                                     (len(self), len(other)))
            if not isinstance(other, ABCSparseArray):
                dtype = getattr(other, 'dtype', None)
                other = SparseArray(other, fill_value=self.fill_value,
                                    dtype=dtype)
            return _sparse_array_op(self, other, op, name)
        elif is_scalar(other):
            fill = op(_get_fill(self), np.asarray(other))
            return _wrap_result(name, op(self.sp_values, other),
                                self.sp_index, fill)
        else:  # pragma: no cover
            raise TypeError('operation with %s not supported' % type(other))

    if name.startswith("__"):
        name = name[2:-2]
    wrapper.__name__ = name
    return wrapper


def _maybe_match_dtype(left, right):
    if not hasattr(right, 'dtype'):
        return left.dtype
    elif left.dtype == right.dtype:
        return getattr(left.dtype, '__name__', left.dtype)
    else:
        # ToDo: to be supported after GH 667
        raise NotImplementedError('dtypes must be identical')


def _get_fill(arr):
    # coerce fill_value to arr dtype if possible
    # int64 SparseArray can have NaN as fill_value if there is no missing
    try:
        return np.asarray(arr.fill_value, dtype=arr.dtype)
    except ValueError:
        return np.asarray(arr.fill_value)


def _sparse_array_op(left, right, op, name, series=False):
    """
    Apply binary ``op`` to two sparse arrays and wrap the result.

    Parameters
    ----------
    left, right : SparseArray
    op : callable
        Two-argument function implementing the operation.
    name : str
        Operation name without underscores (e.g. ``'add'``, ``'rfloordiv'``).
    series : bool, default False
        When True and both operands are integer dtype, they are cast to
        float64 for floordiv/mod-style ops if the operand checked for zeros
        contains a zero.

    Returns
    -------
    Result of ``_wrap_result`` for the computed values, index and fill value.
    """
    if series and is_integer_dtype(left) and is_integer_dtype(right):
        # series coerces to float64 if result should have NaN/inf;
        # pick which operand has to be checked for zeros
        if name in ('floordiv', 'mod'):
            zero_check = right
        elif name in ('rfloordiv', 'rmod'):
            zero_check = left
        else:
            zero_check = None

        if zero_check is not None and (zero_check.values == 0).any():
            left = left.astype(np.float64)
            right = right.astype(np.float64)

    dtype = _maybe_match_dtype(left, right)

    left_index = left.sp_index
    right_index = right.sp_index
    left_has_no_gaps = left_index.ngaps == 0

    if left_has_no_gaps or right_index.ngaps == 0:
        # at least one operand stores every position: operate on dense values
        # and reuse the gap-free operand's index
        result = op(left.get_values(), right.get_values())
        index = left_index if left_has_no_gaps else right_index
        fill = op(_get_fill(left), _get_fill(right))
    elif left_index.equals(right_index):
        # identical sparsity pattern: operate directly on the stored values
        result = op(left.sp_values, right.sp_values)
        index = left_index
        fill = op(_get_fill(left), _get_fill(right))
    else:
        # differing indexes: look up the splib routine by name; reflected ops
        # are expressed as the forward op with swapped operands
        if name[0] == 'r':
            left, right = right, left
            name = name[1:]

        sparse_op = getattr(splib, 'sparse_{name}_{dtype}'.format(
            name=name, dtype=dtype))

        result, index, fill = sparse_op(left.sp_values, left.sp_index,
                                        left.fill_value, right.sp_values,
                                        right.sp_index, right.fill_value)

    return _wrap_result(name, result, index, fill, dtype=result.dtype)


def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
    """
    Wrap an op result in a SparseArray with the correct dtype.

    Parameters
    ----------
    name : str
        Operation name without underscores. Comparison ops force a boolean
        dtype and ``'truediv'`` forces float64; any other name leaves
        ``dtype`` as passed.
    data : array-like
        Values stored at the positions described by ``sparse_index``.
    sparse_index : SparseIndex
    fill_value : scalar
    dtype : dtype, optional

    Returns
    -------
    SparseArray
    """
    if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
        # ToDo: We can remove this condition when removing
        # SparseArray's dtype default when closing GH 667
        # np.bool_ rather than the ``np.bool`` alias, which NumPy deprecated
        # and later removed; both resolve to the same boolean dtype.
        dtype = np.bool_
    elif name == 'truediv':
        dtype = np.float64
    return SparseArray(data, sparse_index=sparse_index,
                       fill_value=fill_value, dtype=dtype)


class SparseArray(PandasObject, np.ndarray):
    """Data structure for labeled, sparse floating point 1-D data

    The ndarray buffer holds only the explicitly stored values
    (``sp_values``); ``sp_index`` records their positions and every other
    position takes ``fill_value``.

    Parameters
    ----------
    data : {array-like (1-D), Series, SparseSeries, dict}
    sparse_index : {BlockIndex, IntIndex}, optional
        Only if you have one. Mainly used internally
    index : sequence, optional
        If given, ``data`` must be a scalar (None means NaN); it is broadcast
        to a float64 array of ``len(index)`` before being sparsified
    kind : {'block', 'integer'}, default 'integer'
        Only used when ``sparse_index`` has to be built from ``data``
    fill_value : float
        Defaults to NaN (code for missing), or to ``data.fill_value`` when
        ``data`` is already a SparseArray
    dtype : dtype, default np.float64
    copy : bool, default False
        Whether to copy the stored values

    Notes
    -----
    SparseArray objects are immutable via the typical Python means. If you
    must change values, convert to dense, make your changes, then convert back
    to sparse
    """
    __array_priority__ = 15
    _typ = 'array'
    _subtyp = 'sparse_array'

    # class-level defaults so attribute access works on instances that have
    # not been through _simple_new / __array_finalize__ yet
    sp_index = None
    fill_value = None

    def __new__(cls, data, sparse_index=None, index=None, kind='integer',
                fill_value=None, dtype=np.float64, copy=False):
        """
        Build a SparseArray; see the class docstring for the parameters.

        Raises
        ------
        ValueError
            If ``index`` is given and ``data`` is not a scalar.
        AssertionError
            If ``sparse_index`` is given and its ``npoints`` does not match
            the number of values in ``data``.
        """
        if index is not None:
            if data is None:
                data = np.nan
            if not is_scalar(data):
                # ValueError is still caught by callers using
                # ``except Exception``
                raise ValueError("must only pass scalars with an index ")
            values = np.empty(len(index), dtype='float64')
            values.fill(data)
            data = values

        if dtype is not None:
            dtype = np.dtype(dtype)
        is_sparse_array = isinstance(data, SparseArray)
        if fill_value is None:
            if is_sparse_array:
                fill_value = data.fill_value
            else:
                fill_value = nan

        if is_sparse_array:
            sparse_index = data.sp_index
            values = np.asarray(data)
        else:
            # array-like
            if sparse_index is None:
                values, sparse_index = make_sparse(data, kind=kind,
                                                   fill_value=fill_value)
            else:
                values = _sanitize_values(data)
                if len(values) != sparse_index.npoints:
                    raise AssertionError("Non array-like type {0} must have"
                                         " the same length as the"
                                         " index".format(type(values)))

        # Create array, do *not* copy data by default
        if copy:
            try:
                # ToDo: Can remove this error handling when we actually
                # support other dtypes
                subarr = np.array(values, dtype=dtype, copy=True)
            except ValueError:
                subarr = np.array(values, copy=True)
        else:
            try:
                subarr = np.asarray(values, dtype=dtype)
            except ValueError:
                subarr = np.asarray(values)

        # if we have a bool type, make sure that we have a bool fill_value
        if ((dtype is not None and issubclass(dtype.type, np.bool_)) or
                (data is not None and lib.is_bool_array(subarr))):
            if np.isnan(fill_value) or not fill_value:
                fill_value = False
            else:
                fill_value = bool(fill_value)

        # Change the class of the array to be the subclass type.
        return cls._simple_new(subarr, sparse_index, fill_value)

    @classmethod
    def _simple_new(cls, data, sp_index, fill_value):
        """
        View ``data`` as ``cls`` and attach ``sp_index`` and ``fill_value``.

        Raises
        ------
        ValueError
            If ``sp_index`` is not a SparseIndex.
        """
        if (is_integer_dtype(data) and is_float(fill_value) and
           sp_index.ngaps > 0):
            # if float fill_value is being included in dense repr,
            # convert values to float
            data = data.astype(float)

        result = data.view(cls)

        if not isinstance(sp_index, SparseIndex):
            # caller must pass SparseIndex
            raise ValueError('sp_index must be a SparseIndex')

        result.sp_index = sp_index
        # assign the private attribute directly, bypassing the validation in
        # the fill_value setter
        result._fill_value = fill_value
        return result

    @property
    def _constructor(self):
        """Callable building a SparseArray with this fill_value and kind."""
        return lambda x: SparseArray(x, fill_value=self.fill_value,
                                     kind=self.kind)

    @property
    def kind(self):
        """
        'block' or 'integer' depending on the type of ``sp_index``; None if
        it is neither a BlockIndex nor an IntIndex.
        """
        if isinstance(self.sp_index, BlockIndex):
            return 'block'
        elif isinstance(self.sp_index, IntIndex):
            return 'integer'

    def __array_wrap__(self, out_arr, context=None):
        """
        NumPy calls this method when ufunc is applied

        Parameters
        ----------

        out_arr : ndarray
            ufunc result (note that ufunc is only applied to sp_values)
        context : tuple of 3 elements (ufunc, signature, domain)
            for example, following is a context when np.sin is applied to
            SparseArray,

            (<ufunc 'sin'>, (SparseArray,), 0))

        See http://docs.scipy.org/doc/numpy/user/basics.subclassing.html
        """
        if isinstance(context, tuple) and len(context) == 3:
            ufunc, args, domain = context
            # to apply ufunc only to fill_value (to avoid recursive call)
            args = [getattr(a, 'fill_value', a) for a in args]
            fill_value = ufunc(self.fill_value, *args[1:])
        else:
            fill_value = self.fill_value

        return self._simple_new(out_arr, sp_index=self.sp_index,
                                fill_value=fill_value)

    def __array_finalize__(self, obj):
        """
        Gets called after any ufunc or other array operations, necessary
        to pass on the index.
        """
        self.sp_index = getattr(obj, 'sp_index', None)
        self._fill_value = getattr(obj, 'fill_value', None)

    def __reduce__(self):
        """Necessary for making this object picklable"""
        object_state = list(ndarray.__reduce__(self))
        # carry fill_value and sp_index next to the ndarray state
        subclass_state = self.fill_value, self.sp_index
        object_state[2] = (object_state[2], subclass_state)
        return tuple(object_state)

    def __setstate__(self, state):
        """Necessary for making this object picklable"""
        nd_state, own_state = state
        ndarray.__setstate__(self, nd_state)

        fill_value, sp_index = own_state[:2]
        self.sp_index = sp_index
        self._fill_value = fill_value

    def __len__(self):
        """Dense length taken from ``sp_index``; 0 if no index is attached."""
        try:
            return self.sp_index.length
        except AttributeError:
            # sp_index is still None (the class-level default)
            return 0

    def __unicode__(self):
        return '%s\nFill: %s\n%s' % (printing.pprint_thing(self),
                                     printing.pprint_thing(self.fill_value),
                                     printing.pprint_thing(self.sp_index))

    def disable(self, other):
        """Shared implementation for the unsupported in-place operators."""
        raise NotImplementedError('inplace binary ops not supported')
    # Inplace operators
    __iadd__ = disable
    __isub__ = disable
    __imul__ = disable
    __itruediv__ = disable
    __ifloordiv__ = disable
    __ipow__ = disable

    # Python 2 division operators
    if not compat.PY3:
        __idiv__ = disable

    @property
    def values(self):
        """
        Dense values
        """
        output = np.empty(len(self), dtype=self.dtype)
        int_index = self.sp_index.to_int_index()
        # start from fill_value everywhere, then overwrite stored positions
        output.fill(self.fill_value)
        output.put(int_index.indices, self)
        return output

    @property
    def sp_values(self):
        """The stored (non-fill) values as a plain ndarray view."""
        # caching not an option, leaks memory
        return self.view(np.ndarray)

    @property
    def fill_value(self):
        """Value taken by every position that is not explicitly stored."""
        return self._fill_value

    @fill_value.setter
    def fill_value(self, value):
        """
        Set the fill value.

        Raises
        ------
        ValueError
            If ``value`` is not a scalar, or if storing it would require
            promoting the array's dtype.
        """
        if not is_scalar(value):
            raise ValueError('fill_value must be a scalar')
        # if the specified value triggers type promotion, raise ValueError
        new_dtype, fill_value = _maybe_promote(self.dtype, value)
        if is_dtype_equal(self.dtype, new_dtype):
            self._fill_value = fill_value
        else:
            msg = 'unable to set fill_value {0} to {1} dtype'
            raise ValueError(msg.format(value, self.dtype))

    def get_values(self, fill=None):
        """ return a dense representation """
        return self.to_dense(fill=fill)

    def to_dense(self, fill=None):
        """
        Convert to a dense ndarray.

        Parameters
        ----------
        fill : object, optional
            Accepted for signature compatibility; not used.

        Returns
        -------
        ndarray
            Same as the ``values`` property.
        """
        return self.values

    def __iter__(self):
        """Yield every element in dense order, fill values included."""
        for i in range(len(self)):
            yield self._get_val_at(i)

    def __getitem__(self, key):
        """
        Return the element(s) selected by ``key``.

        An integer key returns a scalar. A key with a length different from
        ``len(self)`` is passed to ``take``. Any other key (tuple, slice,
        same-length indexer or mask) indexes the dense values and the result
        is re-wrapped through ``_constructor``.
        """
        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            data_slice = self.values[key]
        else:
            if isinstance(key, SparseArray):
                if is_bool_dtype(key):
                    key = key.to_dense()
                else:
                    key = np.asarray(key)

            if hasattr(key, '__len__') and len(self) != len(key):
                return self.take(key)
            else:
                data_slice = self.values[key]

        return self._constructor(data_slice)

    def __getslice__(self, i, j):
        """Python 2 slice protocol; negative bounds are clamped to 0."""
        if i < 0:
            i = 0
        if j < 0:
            j = 0
        slobj = slice(i, j)
        return self.__getitem__(slobj)

    def _get_val_at(self, loc):
        """
        Return the scalar at dense position ``loc``.

        Negative positions count from the end.

        Raises
        ------
        IndexError
            If ``loc`` is out of bounds.
        """
        n = len(self)
        if loc < 0:
            loc += n

        if loc >= n or loc < 0:
            raise IndexError('Out of bounds access')

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            # position is not stored, so it holds the fill value
            return self.fill_value
        else:
            return _index.get_value_at(self, sp_loc)

    @Appender(_index_shared_docs['take'] % _sparray_doc_kwargs)
    def take(self, indices, axis=0, allow_fill=True,
             fill_value=None, **kwargs):
        """
        Sparse-compatible version of ndarray.take

        Returns
        -------
        taken : SparseArray, or a scalar when ``indices`` is an integer

        Raises
        ------
        ValueError
            If ``axis`` is non-zero, or if ``allow_fill`` is True with a
            non-None ``fill_value`` and any index is below -1.
        IndexError
            If any index is out of bounds.
        """
        nv.validate_take(tuple(), kwargs)

        if axis:
            raise ValueError("axis must be 0, input was {0}".format(axis))

        if is_integer(indices):
            # return scalar
            return self[indices]

        indices = _ensure_platform_int(indices)
        n = len(self)
        if allow_fill and fill_value is not None:
            # allow -1 to indicate self.fill_value,
            # self.fill_value may not be NaN
            if (indices < -1).any():
                msg = ('When allow_fill=True and fill_value is not None, '
                       'all indices must be >= -1')
                raise ValueError(msg)
            elif (n <= indices).any():
                msg = 'index is out of bounds for size {0}'
                raise IndexError(msg.format(n))
        else:
            if ((indices < -n) | (n <= indices)).any():
                msg = 'index is out of bounds for size {0}'
                raise IndexError(msg.format(n))

        indices = indices.astype(np.int32)
        if not (allow_fill and fill_value is not None):
            # plain positional take: negative indices count from the end
            indices = indices.copy()
            indices[indices < 0] += n

        locs = self.sp_index.lookup_array(indices)
        indexer = np.arange(len(locs), dtype=np.int32)
        # -1 marks requested positions that are not stored
        mask = locs != -1
        if mask.any():
            indexer = indexer[mask]
            new_values = self.sp_values.take(locs[mask])
        else:
            indexer = np.empty(shape=(0, ), dtype=np.int32)
            new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

        # passing the index object as ``kind`` keeps the same index type
        sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
        return self._simple_new(new_values, sp_index, self.fill_value)

    def __setitem__(self, key, value):
        """Item assignment is not supported; always raises TypeError."""
        raise TypeError(
            "SparseArray does not support item assignment via setitem")

    def __setslice__(self, i, j, value):
        """Slice assignment is not supported; always raises TypeError."""
        raise TypeError("SparseArray does not support item assignment via "
                        "slices")

    def astype(self, dtype=None, copy=True):
        """
        Cast the stored values and the fill value to ``dtype``.

        Raises
        ------
        ValueError
            If the current fill_value cannot be coerced to ``dtype``.
        """
        dtype = np.dtype(dtype)
        sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy)
        try:
            fill_value = dtype.type(self.fill_value)
        except ValueError:
            msg = 'unable to coerce current fill_value {0} to {1} dtype'
            raise ValueError(msg.format(self.fill_value, dtype))
        return self._simple_new(sp_values, self.sp_index,
                                fill_value=fill_value)

    def copy(self, deep=True):
        """
        Make a copy of the SparseArray. Only the actual sparse values need to
        be copied

        Parameters
        ----------
        deep : bool, default True
            If False, the stored values are passed on without an explicit
            copy.
        """
        if deep:
            values = self.sp_values.copy()
        else:
            values = self.sp_values
        return SparseArray(values, sparse_index=self.sp_index,
                           dtype=self.dtype, fill_value=self.fill_value)

    def count(self):
        """
        Compute the number of non-NA/null observations. If the
        fill_value is not NaN, the "sparse" locations will be included in the
        observation count

        Returns
        -------
        nobs : int
        """
        sp_values = self.sp_values
        # notnull (not np.isfinite) so +/-inf counts as an observation, in
        # line with _valid_sp_values as used by sum() and mean()
        valid_spvals = notnull(sp_values).sum()
        if self._null_fill_value:
            return valid_spvals
        else:
            return valid_spvals + self.sp_index.ngaps

    @property
    def _null_fill_value(self):
        """True when the fill value is null."""
        return isnull(self.fill_value)

    @property
    def _valid_sp_values(self):
        """The stored values with null entries removed."""
        sp_vals = self.sp_values
        mask = notnull(sp_vals)
        return sp_vals[mask]

    @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs)
    def fillna(self, value, downcast=None):
        if downcast is not None:
            raise NotImplementedError

        if issubclass(self.dtype.type, np.floating):
            value = float(value)

        if self._null_fill_value:
            # the gaps are the nulls: changing the fill value is enough
            return self._simple_new(self.sp_values, self.sp_index,
                                    fill_value=value)
        else:
            # nulls can only be among the stored values
            new_values = self.sp_values.copy()
            new_values[isnull(new_values)] = value
            return self._simple_new(new_values, self.sp_index,
                                    fill_value=self.fill_value)

    def sum(self, axis=0, *args, **kwargs):
        """
        Sum of non-NA/null values

        Returns
        -------
        sum : float
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        if self._null_fill_value:
            return sp_sum
        else:
            # each gap contributes one fill_value
            nsparse = self.sp_index.ngaps
            return sp_sum + self.fill_value * nsparse

    def cumsum(self, axis=0, *args, **kwargs):
        """
        Cumulative sum of values. Preserves locations of NaN values

        Returns
        -------
        cumsum : SparseArray when the fill value is null, otherwise the
            cumulative sum of the dense values
        """
        nv.validate_cumsum(args, kwargs)

        # TODO: gh-12855 - return a SparseArray here
        if notnull(self.fill_value):
            return self.to_dense().cumsum()

        # TODO: what if sp_values contains NaN??
        return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index,
                           fill_value=self.fill_value)

    def mean(self, axis=0, *args, **kwargs):
        """
        Mean of non-NA/null values

        Returns
        -------
        mean : float
        """
        nv.validate_mean(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        ct = len(valid_vals)

        if self._null_fill_value:
            return sp_sum / ct
        else:
            # gaps count as observations equal to fill_value
            nsparse = self.sp_index.ngaps
            return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

    def value_counts(self, dropna=True):
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : boolean, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        keys, counts = algos._value_counts_arraylike(self.sp_values,
                                                     dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0:
            if self._null_fill_value and dropna:
                # gaps are NaN and NaN is being dropped: nothing to add
                pass
            else:
                if self._null_fill_value:
                    mask = pd.isnull(keys)
                else:
                    mask = keys == self.fill_value

                if mask.any():
                    # fill value already among the keys: add the gaps to it
                    counts[mask] += fcounts
                else:
                    keys = np.insert(keys, 0, self.fill_value)
                    counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, pd.Index):
            keys = pd.Index(keys)
        result = pd.Series(counts, index=keys)
        return result


def _maybe_to_dense(obj):
    """ try to convert to dense """
    if hasattr(obj, 'to_dense'):
        return obj.to_dense()
    return obj


def _maybe_to_sparse(array):
    """
    Unwrap a SparseSeries into a copy of its ``values``.

    ``array`` must be SparseSeries or SparseArray; anything that is not a
    SparseSeries is returned unchanged.
    """
    if not isinstance(array, ABCSparseSeries):
        return array
    return array.values.copy()


def _sanitize_values(arr):
    """
    return an ndarray for our input,
    in a platform independent manner
    """

    if hasattr(arr, 'values'):
        arr = arr.values
    else:

        # scalar
        if is_scalar(arr):
            arr = [arr]

        # ndarray
        if isinstance(arr, np.ndarray):
            pass

        elif is_list_like(arr) and len(arr) > 0:
            arr = _possibly_convert_platform(arr)

        else:
            arr = np.asarray(arr)

    return arr


def make_sparse(arr, kind='block', fill_value=nan):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value

    Returns
    -------
    (sparse_values, index) : (ndarray, SparseIndex)

    Raises
    ------
    TypeError
        If ``arr`` has more than one dimension.
    """
    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    # True wherever a value has to be stored explicitly
    mask = notnull(arr) if isnull(fill_value) else arr != fill_value

    length = len(arr)
    if length == mask.size:
        indices = np.arange(length, dtype=np.int32)[mask]
    else:
        # the arr is a SparseArray
        indices = mask.sp_index.indices

    index = _make_index(length, indices, kind)
    return arr[mask], index


def _make_index(length, indices, kind):
    """
    Build a sparse index of the requested kind.

    Parameters
    ----------
    length : int
        Total (dense) length.
    indices : array-like
        Positions of the stored values.
    kind : {'block', 'integer'} or an existing BlockIndex / IntIndex
        An index instance selects the matching index type.

    Raises
    ------
    ValueError
        If ``kind`` is none of the above.
    """
    if kind == 'block' or isinstance(kind, BlockIndex):
        locs, lens = splib.get_blocks(indices)
        return BlockIndex(length, locs, lens)

    if kind == 'integer' or isinstance(kind, IntIndex):
        return IntIndex(length, indices)

    raise ValueError('must be block or integer type')  # pragma: no cover


# Runs at import time: hand SparseArray to ops.add_special_arithmetic_methods
# with _arith_method as the factory for both arithmetic and comparison
# methods, and with numexpr disabled.
ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
                                   comp_method=_arith_method,
                                   use_numexpr=False)
Summary ✨

This module implements SparseArray, a one-dimensional NumPy ndarray subclass from pandas that stores only the values differing from a fill value, together with a sparse index (BlockIndex or IntIndex, imported from pandas._sparse) that records their positions. It also defines the helpers that build sparse arrays from dense data and that apply arithmetic, comparison, and reduction operations to them.
Tech Fingerprint

Alerts (31)

'def' Ensure functions have docstrings for documentation
47 250 319 345 350 354 424 503 554
'isinstance(' Overuse may indicate design issues; consider polymorphism
48 52 179 236 251 253 273 385 388 655 670 690 741 744
'type(' Use isinstance() for type checking instead of type()
62
Complexity hotspot; line 91 (total complexity: 3)
91
'raise Exception(' Raise specific exception types for better error handling
172 481 494
'lambda' Avoid complex 'lambda' functions; prefer named functions for clarity and debugging
246
'list(' Avoid unnecessary list conversions; use generators where possible
294
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
311