/sklearn/utils/multiclass.py
Python | 388 lines | 317 code | 28 blank | 43 comment | 28 complexity | 45f569b7886575882f2cf5dd7316449a MD5 | raw file
- # Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
- #
- # License: BSD 3 clause
- """
- Multi-class / multi-label utility function
- ==========================================
- """
- from __future__ import division
- from collections import Sequence
- from itertools import chain
- from scipy.sparse import issparse
- from scipy.sparse.base import spmatrix
- from scipy.sparse import dok_matrix
- from scipy.sparse import lil_matrix
- import numpy as np
- from ..externals.six import string_types
- from .validation import check_array
- from ..utils.fixes import bincount
- from ..utils.fixes import array_equal
- def _unique_multiclass(y):
- if hasattr(y, '__array__'):
- return np.unique(np.asarray(y))
- else:
- return set(y)
def _unique_indicator(y):
    """Return the column indices of the label indicator matrix ``y``.

    ``y`` is first passed through ``check_array``; the labels are then the
    integers ``0 .. n_columns - 1`` of the validated matrix.
    """
    validated = check_array(y, ['csr', 'csc', 'coo'])
    n_labels = validated.shape[1]
    return np.arange(n_labels)
# Dispatch table: target type name -> helper extracting its unique labels.
# ``unique_labels`` rejects any target type that has no entry here.
_FN_UNIQUE_LABELS = dict([
    ('binary', _unique_multiclass),
    ('multiclass', _unique_multiclass),
    ('multilabel-indicator', _unique_indicator),
])
def unique_labels(*ys):
    """Extract an ordered array of unique labels

    The following combinations are rejected:

        - a mix of multilabel and multiclass (single label) targets
        - a mix of label indicator matrix and anything else
          (an indicator matrix carries no explicit labels)
        - label indicator matrices with different numbers of columns
        - a mix of string and non-string labels

    Target types without an entry in ``_FN_UNIQUE_LABELS`` (for instance
    "multiclass-multioutput") are rejected as well.

    Parameters
    ----------
    *ys : array-likes,

    Returns
    -------
    out : numpy array of shape [n_unique_labels]
        An ordered array of unique labels.

    Raises
    ------
    ValueError
        If no argument is given, if the targets are of incompatible types,
        if indicator matrices disagree on their number of columns, if the
        target type is not supported, or if string and non-string labels
        are mixed.

    Examples
    --------
    >>> from sklearn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    if not ys:
        raise ValueError('No argument has been passed.')

    # All inputs must share one target type; "binary" is folded into
    # "multiclass" when both are present.
    target_kinds = set(type_of_target(y) for y in ys)
    if target_kinds == set(["binary", "multiclass"]):
        target_kinds = set(["multiclass"])
    if len(target_kinds) > 1:
        raise ValueError("Mix type of y not allowed, got types %s"
                         % target_kinds)
    label_type = target_kinds.pop()

    # Indicator matrices must all have the same number of columns.
    if label_type == "multilabel-indicator":
        widths = set(check_array(y, ['csr', 'csc', 'coo']).shape[1]
                     for y in ys)
        if len(widths) > 1:
            raise ValueError("Multi-label binary indicator input with "
                             "different numbers of labels")

    # Pick the label extractor matching the target type.
    extractor = _FN_UNIQUE_LABELS.get(label_type, None)
    if not extractor:
        raise ValueError("Unknown label type: %s" % repr(ys))

    # Union of the labels found in every input.
    found = set()
    for y in ys:
        found.update(extractor(y))

    # Strings and non-strings cannot be ordered together.
    string_flags = set(isinstance(label, string_types) for label in found)
    if len(string_flags) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return np.array(sorted(found))
- def _is_integral_float(y):
- return y.dtype.kind == 'f' and np.all(y.astype(int) == y)
def is_multilabel(y):
    """ Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : array-like or sparse matrix
        Target values.

    Returns
    -------
    out : bool,
        Return ``True``, if ``y`` is in a multilabel format, else ``False``.
        ``y`` qualifies when it is two-dimensional with more than one
        column and, for dense input, holds fewer than three distinct
        values of bool/int/uint dtype (or floats that are all whole
        numbers). Sparse input qualifies when it stores no entries, or a
        single distinct value satisfying the same dtype rule.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    if hasattr(y, '__array__'):
        y = np.asarray(y)

    # Only 2d inputs with at least two columns can be indicator matrices.
    two_d_multi_column = (hasattr(y, "shape") and y.ndim == 2
                          and y.shape[1] > 1)
    if not two_d_multi_column:
        return False

    if not issparse(y):
        distinct = np.unique(y)
        if len(distinct) >= 3:
            return False
        # bool, int, uint -- or floats that are all whole numbers
        return y.dtype.kind in 'biu' or _is_integral_float(distinct)

    # dok/lil matrices are converted so that ``.data`` is a flat array.
    if isinstance(y, (dok_matrix, lil_matrix)):
        y = y.tocsr()
    stored = y.data
    if len(stored) == 0:
        return True
    distinct = np.unique(stored)
    if distinct.size != 1:
        return False
    # bool, int, uint -- or floats that are all whole numbers
    return y.dtype.kind in 'biu' or _is_integral_float(distinct)
def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    The target type is computed with ``type_of_target``; only these
    values are accepted:
        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    Parameters
    ----------
    y : array-like

    Raises
    ------
    ValueError
        If the target type of ``y`` is not one of the accepted values.
    """
    accepted_types = ('binary', 'multiclass', 'multiclass-multioutput',
                      'multilabel-indicator', 'multilabel-sequences')
    y_type = type_of_target(y)
    if y_type in accepted_types:
        return
    raise ValueError("Unknown label type: %r" % y_type)
def type_of_target(y):
    """Determine the type of data indicated by target `y`

    Parameters
    ----------
    y : array-like

    Returns
    -------
    target_type : string
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, a 2d array without columns, an array of non-string
          objects, or an input that ``np.asarray`` fails to convert.

    Raises
    ------
    ValueError
        If `y` is a string or is not array-like, or if its first element
        is a non-string sequence without ``__array__`` (the legacy
        sequence of sequences multi-label representation).

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import type_of_target
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multiclass-multioutput'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    array_like = (isinstance(y, (Sequence, spmatrix))
                  or hasattr(y, '__array__'))
    if not array_like or isinstance(y, string_types):
        raise ValueError('Expected array-like (array or non-string sequence), '
                         'got %r' % y)

    if is_multilabel(y):
        return 'multilabel-indicator'

    try:
        y = np.asarray(y)
    except ValueError:
        # Known to fail in numpy 1.3 for array of arrays
        return 'unknown'

    # The old sequence of sequences format
    try:
        first = y[0]
        legacy_format = (not hasattr(first, '__array__')
                         and isinstance(first, Sequence)
                         and not isinstance(first, string_types))
        if legacy_format:
            raise ValueError('You appear to be using a legacy multi-label data'
                             ' representation. Sequence of sequences are no'
                             ' longer supported; use a binary array or sparse'
                             ' matrix instead.')
    except IndexError:
        # No first element to inspect; fall through to the shape checks.
        pass

    # Invalid inputs
    if y.ndim > 2:
        return 'unknown'  # [[[1, 2]]]
    if (y.dtype == object and len(y)
            and not isinstance(y.flat[0], string_types)):
        return 'unknown'  # [obj_1] and not ["label_1"]
    if y.ndim == 2 and y.shape[1] == 0:
        return 'unknown'  # [[]]

    # [[1, 2], [1, 2]] gets the suffix; [1, 2, 3] or [[1], [2], [3]] do not
    multi_column = y.ndim == 2 and y.shape[1] > 1
    suffix = "-multioutput" if multi_column else ""

    # check float and contains non-integer float values
    if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        return 'continuous' + suffix

    more_than_two_values = len(np.unique(y)) > 2
    if more_than_two_values or (y.ndim >= 2 and len(y[0]) > 1):
        return 'multiclass' + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
    return 'binary'  # [1, 2] or [["a"], ["b"]]
- def _check_partial_fit_first_call(clf, classes=None):
- """Private helper function for factorizing common classes param logic
- Estimators that implement the ``partial_fit`` API need to be provided with
- the list of possible classes at the first call to partial_fit.
- Subsequent calls to partial_fit should check that ``classes`` is still
- consistent with a previous value of ``clf.classes_`` when provided.
- This function returns True if it detects that this was the first call to
- ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
- set on ``clf``.
- """
- if getattr(clf, 'classes_', None) is None and classes is None:
- raise ValueError("classes must be passed on the first call "
- "to partial_fit.")
- elif classes is not None:
- if getattr(clf, 'classes_', None) is not None:
- if not array_equal(clf.classes_, unique_labels(classes)):
- raise ValueError(
- "`classes=%r` is not the same as on last call "
- "to partial_fit, was: %r" % (classes, clf.classes_))
- else:
- # This is the first call to partial_fit
- clf.classes_ = unique_labels(classes)
- return True
- # classes is None and clf.classes_ has already previously been set:
- # nothing to do
- return False
def class_distribution(y, sample_weight=None):
    """Compute class priors from multioutput-multiclass target data

    Parameters
    ----------
    y : array like or sparse matrix of size (n_samples, n_outputs)
        The labels for each example.

    sample_weight : array-like of shape = (n_samples,), optional
        Sample weights.

    Returns
    -------
    classes : list of size n_outputs of arrays of size (n_classes,)
        List of classes for each column.

    n_classes : list of integers of size n_outputs
        Number of classes in each column

    class_prior : list of size n_outputs of arrays of size (n_classes,)
        Class distribution of each column; each array is divided by its
        own sum.
    """
    n_samples, n_outputs = y.shape
    classes, n_classes, class_prior = [], [], []

    if not issparse(y):
        for k in range(n_outputs):
            col_classes, col_codes = np.unique(y[:, k], return_inverse=True)
            classes.append(col_classes)
            n_classes.append(col_classes.shape[0])
            col_weights = bincount(col_codes, weights=sample_weight)
            class_prior.append(col_weights / col_weights.sum())
        return (classes, n_classes, class_prior)

    y = y.tocsc()
    stored_per_column = np.diff(y.indptr)

    for k in range(n_outputs):
        start, stop = y.indptr[k], y.indptr[k + 1]
        stored_rows = y.indices[start:stop]

        # separate sample weights for zero and non-zero elements
        if sample_weight is None:
            stored_weight = None
            implicit_zero_weight = y.shape[0] - stored_per_column[k]
        else:
            stored_weight = np.asarray(sample_weight)[stored_rows]
            implicit_zero_weight = (np.sum(sample_weight) -
                                    np.sum(stored_weight))

        col_classes, col_codes = np.unique(y.data[start:stop],
                                           return_inverse=True)
        col_weights = bincount(col_codes, weights=stored_weight)

        if 0 in col_classes:
            # An explicit zero was found, combine its weight with the
            # weight of the implicit zeros
            col_weights[col_classes == 0] += implicit_zero_weight
        elif stored_per_column[k] < y.shape[0]:
            # There is an implicit zero that is not yet in the classes and
            # the priors: make an entry for it
            col_classes = np.insert(col_classes, 0, 0)
            col_weights = np.insert(col_weights, 0, implicit_zero_weight)

        classes.append(col_classes)
        n_classes.append(col_classes.shape[0])
        class_prior.append(col_weights / col_weights.sum())

    return (classes, n_classes, class_prior)