/sklearn/utils/estimator_checks.py
Python | 3047 lines | 2528 code | 259 blank | 260 comment | 227 complexity | 75f5c0f5bd11b82772c418f6bdfe02bc MD5 | raw file
Possible License(s): BSD-3-Clause
Large files are truncated, but you can click here to view the full file
- import types
- import warnings
- import sys
- import traceback
- import pickle
- import re
- from copy import deepcopy
- from functools import partial
- from itertools import chain
- from inspect import signature
- import numpy as np
- from scipy import sparse
- from scipy.stats import rankdata
- import joblib
- from . import IS_PYPY
- from .. import config_context
- from ._testing import assert_raises, _get_args
- from ._testing import assert_raises_regex
- from ._testing import assert_raise_message
- from ._testing import assert_array_equal
- from ._testing import assert_array_almost_equal
- from ._testing import assert_allclose
- from ._testing import assert_allclose_dense_sparse
- from ._testing import assert_warns_message
- from ._testing import set_random_state
- from ._testing import SkipTest
- from ._testing import ignore_warnings
- from ._testing import create_memmap_backed_data
- from . import is_scalar_nan
- from ..discriminant_analysis import LinearDiscriminantAnalysis
- from ..linear_model import Ridge
- from ..base import (clone, ClusterMixin, is_classifier, is_regressor,
- RegressorMixin, is_outlier_detector, BaseEstimator)
- from ..metrics import accuracy_score, adjusted_rand_score, f1_score
- from ..random_projection import BaseRandomProjection
- from ..feature_selection import SelectKBest
- from ..pipeline import make_pipeline
- from ..exceptions import DataConversionWarning
- from ..exceptions import NotFittedError
- from ..exceptions import SkipTestWarning
- from ..model_selection import train_test_split
- from ..model_selection import ShuffleSplit
- from ..model_selection._validation import _safe_split
- from ..metrics.pairwise import (rbf_kernel, linear_kernel, pairwise_distances)
- from .import shuffle
- from .import deprecated
- from .validation import has_fit_parameter, _num_samples
- from ..preprocessing import StandardScaler
- from ..datasets import (load_iris, load_boston, make_blobs,
- make_multilabel_classification, make_regression)
# Module-level cache for the (X, y) Boston subset; it stays None until
# `_boston_subset` fills it in on first use.
BOSTON = None

# Estimator names for which `_yield_checks` skips the pipeline-consistency
# check, because cross-decomposition's "transform" returns X and Y.
CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']
def _yield_checks(name, estimator):
    """Yield the checks that are not specific to one estimator type.

    Parameters
    ----------
    name : str
        Estimator name; compared against ``CROSS_DECOMPOSITION``.
    estimator : estimator object
        Instance whose tags and attributes select the optional checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    tags = estimator._get_tags()

    # Checks that are always run.
    yield from (
        check_no_attributes_set_in_init,
        check_estimators_dtypes,
        check_fit_score_takes_y,
        check_sample_weights_pandas_series,
        check_sample_weights_not_an_array,
        check_sample_weights_list,
        check_sample_weights_shape,
        check_sample_weights_invariance,
        check_estimators_fit_returns_self,
        partial(check_estimators_fit_returns_self, readonly_memmap=True),
    )

    validates_input = not tags["no_validation"]
    if validates_input:
        # Estimators that validate input must give informative messages,
        # e.g. when trained on empty datasets.
        yield from (
            check_complex_data,
            check_dtype_object,
            check_estimators_empty_data_messages,
        )

    # cross-decomposition's "transform" returns X and Y, so the pipeline
    # consistency check is skipped for those estimators.
    if name not in CROSS_DECOMPOSITION:
        yield check_pipeline_consistency

    # Estimators must check their input for NaN's and infs unless they
    # explicitly allow NaN or do no validation at all.
    if not tags["allow_nan"] and validates_input:
        yield check_estimators_nan_inf

    # Pairwise estimators must throw an error on non-square input.
    if _is_pairwise(estimator):
        yield check_nonsquare_error

    yield check_estimators_overwrite_params

    if hasattr(estimator, 'sparsify'):
        yield check_sparsify_coefficients

    yield check_estimator_sparse_data

    # Estimators must be picklable and, once pickled, give the same
    # answer as before.
    yield check_estimators_pickle
def _yield_classifier_checks(name, classifier):
    """Yield the checks that apply to classifiers.

    Parameters
    ----------
    name : str
        Estimator name (not used to select checks here).
    classifier : estimator object
        Classifier instance whose tags and parameters select the
        optional checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    tags = classifier._get_tags()

    yield from (
        # classifiers can handle non-array data and pandas objects
        check_classifier_data_not_an_array,
        # classifiers trained on a single label always return this label
        check_classifiers_one_label,
        check_classifiers_classes,
        check_estimators_partial_fit_n_features,
    )

    if tags["multioutput"]:
        yield check_classifier_multioutput

    # basic consistency testing
    yield from (
        check_classifiers_train,
        partial(check_classifiers_train, readonly_memmap=True),
        partial(check_classifiers_train, readonly_memmap=True,
                X_dtype='float32'),
        check_classifiers_regression_target,
    )

    if tags["multilabel"]:
        yield check_classifiers_multilabel_representation_invariance

    if not tags["no_validation"]:
        yield from (check_supervised_y_no_nan, check_supervised_y_2d)

    if tags["requires_fit"]:
        yield check_estimators_unfitted

    if 'class_weight' in classifier.get_params():
        yield check_class_weight_classifiers

    yield check_non_transformer_estimators_n_iter
    # predict_proba must be a monotonic transformation of decision_function
    yield check_decision_proba_consistency
@ignore_warnings(category=FutureWarning)
def check_supervised_y_no_nan(name, estimator_orig):
    """Check that fitting on a non-finite target raises the expected error.

    A clone of the estimator is fitted on random X and a target filled
    with ``np.inf``. The fit must raise a ``ValueError`` whose message
    equals the expected input-validation message exactly; otherwise a
    ``ValueError`` describing the failure is raised.
    """
    model = clone(estimator_orig)
    random_state = np.random.RandomState(888)
    features = random_state.randn(10, 5)
    target = _enforce_estimator_tags_y(model, np.full(10, np.inf))

    expected = ("Input contains NaN, infinity or a value too large for "
                "dtype('float64').")
    try:
        model.fit(features, target)
    except ValueError as exc:
        if str(exc) == expected:
            # Right exception type and right message: the check passes.
            return
        raise ValueError("Estimator {0} raised error as expected, but "
                         "does not match expected error message"
                         .format(name))

    # Reaching this point means fit() accepted the invalid target.
    raise ValueError("Estimator {0} should have raised error on fitting "
                     "array y with NaN value.".format(name))
def _yield_regressor_checks(name, regressor):
    """Yield the checks that apply to regressors.

    Parameters
    ----------
    name : str
        Estimator name; ``'CCA'`` is excluded from the int-input check.
    regressor : estimator object
        Regressor instance whose tags select the optional checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    tags = regressor._get_tags()

    # TODO: test with intercept
    # TODO: test with multiple responses
    # basic testing
    yield from (
        check_regressors_train,
        partial(check_regressors_train, readonly_memmap=True),
        partial(check_regressors_train, readonly_memmap=True,
                X_dtype='float32'),
        check_regressor_data_not_an_array,
        check_estimators_partial_fit_n_features,
    )

    if tags["multioutput"]:
        yield check_regressor_multioutput

    yield check_regressors_no_decision_function

    if not tags["no_validation"]:
        yield check_supervised_y_2d

    yield check_supervised_y_no_nan

    # the regressor must handle int input (CCA is exempt)
    if name != 'CCA':
        yield check_regressors_int

    if tags["requires_fit"]:
        yield check_estimators_unfitted

    yield check_non_transformer_estimators_n_iter
def _yield_transformer_checks(name, transformer):
    """Yield the checks that apply to estimators exposing ``transform``.

    Parameters
    ----------
    name : str
        Estimator name; some names are excluded from the ``n_iter`` check.
    transformer : estimator object
        Transformer instance whose tags select the optional checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    # All transformers should either deal with sparse data or raise an
    # exception with type TypeError and an intelligible error message
    validates_input = not transformer._get_tags()["no_validation"]
    if validates_input:
        yield check_transformer_data_not_an_array

    # these don't actually fit the data, so don't raise errors
    yield check_transformer_general
    yield partial(check_transformer_general, readonly_memmap=True)

    is_stateless = transformer._get_tags()["stateless"]
    if not is_stateless:
        yield check_transformers_unfitted

    # These depend on external solvers, so accessing the iter param is
    # non-trivial; they are left out of the n_iter check.
    skip_n_iter = ('Isomap', 'KernelPCA', 'LocallyLinearEmbedding',
                   'RandomizedLasso', 'LogisticRegressionCV')
    if name not in skip_n_iter:
        yield check_transformer_n_iter
def _yield_clustering_checks(name, clusterer):
    """Yield the checks that apply to clusterers.

    Parameters
    ----------
    name : str
        Estimator name; feature-agglomeration estimators skip the
        sample-clustering checks.
    clusterer : estimator object
        Clusterer instance (not inspected here).

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    yield check_clusterer_compute_labels_predict

    clusters_features = name in ('WardAgglomeration', "FeatureAgglomeration")
    if not clusters_features:
        # Clustering on the features is not tested here; only estimators
        # that cluster samples get these checks.
        yield from (
            check_clustering,
            partial(check_clustering, readonly_memmap=True),
            check_estimators_partial_fit_n_features,
        )

    yield check_non_transformer_estimators_n_iter
def _yield_outliers_checks(name, estimator):
    """Yield the checks that apply to outlier detectors.

    Parameters
    ----------
    name : str
        Estimator name (not used to select checks here).
    estimator : estimator object
        Outlier detector whose methods and tags select the checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    # checks for outlier detectors that have a fit_predict method
    if hasattr(estimator, 'fit_predict'):
        yield check_outliers_fit_predict

    # Everything below only applies to estimators that can be used on a
    # test set, i.e. those exposing predict.
    if not hasattr(estimator, 'predict'):
        return

    yield check_outliers_train
    yield partial(check_outliers_train, readonly_memmap=True)
    # outlier detectors can handle non-array data
    yield check_classifier_data_not_an_array
    # NotFittedError must be raised when used before fit
    if estimator._get_tags()["requires_fit"]:
        yield check_estimators_unfitted
def _yield_all_checks(name, estimator):
    """Yield every check applicable to ``estimator``.

    Nothing is yielded, and a ``SkipTestWarning`` is issued, when the
    estimator does not accept ``"2darray"`` input or when its
    ``_skip_test`` tag is set.

    Parameters
    ----------
    name : str
        Estimator name, forwarded to the per-type check generators.
    estimator : estimator object
        Instance whose type, methods and tags select the checks.

    Yields
    ------
    check : callable
        A check function, or a ``functools.partial`` wrapping one.
    """
    tags = estimator._get_tags()

    if "2darray" not in tags["X_types"]:
        warnings.warn("Can't test estimator {} which requires input "
                      " of type {}".format(name, tags["X_types"]),
                      SkipTestWarning)
        return

    if tags["_skip_test"]:
        warnings.warn("Explicit SKIP via _skip_test tag for estimator "
                      "{}.".format(name),
                      SkipTestWarning)
        return

    # Generic checks first, then the ones selected by estimator type.
    yield from _yield_checks(name, estimator)
    if is_classifier(estimator):
        yield from _yield_classifier_checks(name, estimator)
    if is_regressor(estimator):
        yield from _yield_regressor_checks(name, estimator)
    if hasattr(estimator, 'transform'):
        yield from _yield_transformer_checks(name, estimator)
    if isinstance(estimator, ClusterMixin):
        yield from _yield_clustering_checks(name, estimator)
    if is_outlier_detector(estimator):
        yield from _yield_outliers_checks(name, estimator)

    yield from (
        check_fit2d_predict1d,
        check_methods_subset_invariance,
        check_fit2d_1sample,
        check_fit2d_1feature,
        check_fit1d,
        check_get_params_invariance,
        check_set_params,
        check_dict_unchanged,
        check_dont_overwrite_parameters,
        check_fit_idempotent,
    )

    # Tag-dependent checks.
    if not tags["no_validation"]:
        yield check_n_features_in
    if tags["requires_y"]:
        yield check_requires_y_none
    if tags["requires_positive_X"]:
        yield check_fit_non_negative
- def _set_check_estimator_ids(obj):
- """Create pytest ids for checks.
- When `obj` is an estimator, this returns the pprint version of the
- estimator (with `print_changed_only=True`). When `obj` is a function, the
- name of the function is returned with its keyworld arguments.
- `_set_check_estimator_ids` is designed to be used as the `id` in
- `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)`
- is yielding estimators and checks.
- Parameters
- ----------
- obj : estimator or function
- Items generated by `check_estimator`
- Returns
- -------
- id : string or None
- See also
- --------
- check_estimator
- """
- if callable(obj):
- if not isinstance(obj, partial):
- return obj.__name__
- if not obj.keywords:
- return obj.func.__name__
- kwstring = ",".join(["{}={}".format(k, v)
- for k, v in obj.keywords.items()])
- return "{}({})".format(obj.func.__name__, kwstring)
- if hasattr(obj, "get_params"):
- with config_context(print_changed_only=True):
- return re.sub(r"\s", "", str(obj))
- def _construct_instance(Estimator):
- """Construct Estimator instance if possible"""
- required_parameters = getattr(Estimator, "_required_parameters", [])
- if len(required_parameters):
- if required_parameters in (["estimator"], ["base_estimator"]):
- if issubclass(Estimator, RegressorMixin):
- estimator = Estimator(Ridge())
- else:
- estimator = Estimator(LinearDiscriminantAnalysis())
- else:
- raise SkipTest("Can't instantiate estimator {} which requires "
- "parameters {}".format(Estimator.__name__,
- required_parameters))
- else:
- estimator = Estimator()
- return estimator
# TODO: probably not needed anymore in 0.24 since _generate_class_checks should
# be removed too. Just put this in check_estimator()
def _generate_instance_checks(name, estimator):
    """Generate instance checks.

    Yields ``(estimator, check)`` pairs, one per check produced by
    ``_yield_all_checks``, where ``check`` already has ``name`` bound as
    its first argument.
    """
    for raw_check in _yield_all_checks(name, estimator):
        yield estimator, partial(raw_check, name)
# TODO: remove this in 0.24
def _generate_class_checks(Estimator):
    """Generate class checks.

    First yields the class paired with the default-constructibility
    check, then every ``(instance, check)`` pair for an instance built by
    ``_construct_instance``.
    """
    class_name = Estimator.__name__
    constructible_check = partial(check_parameters_default_constructible,
                                  class_name)
    yield (Estimator, constructible_check)

    instance = _construct_instance(Estimator)
    for pair in _generate_instance_checks(class_name, instance):
        yield pair
def _mark_xfail_checks(estimator, check, pytest):
    """Mark (estimator, check) pairs with xfail according to the
    _xfail_checks_ tag.

    Parameters
    ----------
    estimator : estimator object or class
        Estimator being checked.
    check : callable
        The check; its id from ``_set_check_estimator_ids`` is looked up
        in the estimator's ``_xfail_checks`` tag.
    pytest : module
        The imported ``pytest`` module, passed in by the caller.

    Returns
    -------
    pair : tuple or ``pytest.param``
        ``(estimator, check)`` unchanged when the check is not listed,
        otherwise a ``pytest.param`` carrying an ``xfail`` mark.
    """
    if isinstance(estimator, type):
        # Try to construct an estimator instance; if that is not possible
        # return the class untouched, ignoring the tag.
        # TODO: remove this if block in 0.24 since passing classes isn't
        # supported anymore
        try:
            estimator = _construct_instance(estimator)
        except Exception:
            return estimator, check

    expected_failures = estimator._get_tags()['_xfail_checks'] or {}
    check_id = _set_check_estimator_ids(check)

    if check_id in expected_failures:
        # The tag lists this check: mark it as xfail for pytest.
        xfail_mark = pytest.mark.xfail(reason=expected_failures[check_id])
        return pytest.param(estimator, check, marks=xfail_mark)

    # The check isn't part of the xfail_checks tag: return it as is.
    return estimator, check
def parametrize_with_checks(estimators):
    """Pytest specific decorator for parametrizing estimator checks.

    The `id` of each check is set to be a pprint version of the estimator
    and the name of the check with its keyword arguments.
    This allows to use `pytest -k` to specify which tests to run::

        pytest test_check_estimators.py -k check_estimators_fit_returns_self

    Parameters
    ----------
    estimators : list of estimators objects or classes
        Estimators to generate checks for. Any iterable is accepted; it
        is materialized once so that one-shot iterables are not partially
        consumed.

        .. deprecated:: 0.23
           Passing a class is deprecated from version 0.23, and won't be
           supported in 0.24. Pass an instance instead.

    Returns
    -------
    decorator : `pytest.mark.parametrize`

    Examples
    --------
    >>> from sklearn.utils.estimator_checks import parametrize_with_checks
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.tree import DecisionTreeRegressor

    >>> @parametrize_with_checks([LogisticRegression(),
    ...                           DecisionTreeRegressor()])
    ... def test_sklearn_compatible_estimator(estimator, check):
    ...     check(estimator)

    """
    import pytest

    # `estimators` is walked twice (class scan, then check generation);
    # a generator would otherwise lose the items consumed by `any`.
    estimators = list(estimators)

    if any(isinstance(est, type) for est in estimators):
        # TODO: remove class support in 0.24 and update docstrings
        msg = ("Passing a class is deprecated since version 0.23 "
               "and won't be supported in 0.24. "
               "Please pass an instance instead.")
        warnings.warn(msg, FutureWarning)

    checks_generator = chain.from_iterable(
        check_estimator(estimator, generate_only=True)
        for estimator in estimators)

    checks_with_marks = (
        _mark_xfail_checks(estimator, check, pytest)
        for estimator, check in checks_generator)

    return pytest.mark.parametrize("estimator, check", checks_with_marks,
                                   ids=_set_check_estimator_ids)
def check_estimator(Estimator, generate_only=False):
    """Check if estimator adheres to scikit-learn conventions.

    This estimator will run an extensive test-suite for input validation,
    shapes, etc, making sure that the estimator complies with `scikit-learn`
    conventions as detailed in :ref:`rolling_your_own_estimator`.
    Additional tests for classifiers, regressors, clustering or transformers
    will be run if the Estimator class inherits from the corresponding mixin
    from sklearn.base.

    This test can be applied to classes or instances.
    Classes currently have some additional tests that related to construction,
    while passing instances allows the testing of multiple options. However,
    support for classes is deprecated since version 0.23 and will be removed
    in version 0.24 (class checks will still be run on the instances).

    Setting `generate_only=True` returns a generator that yields (estimator,
    check) tuples where the check can be called independently from each
    other, i.e. `check(estimator)`. This allows all checks to be run
    independently and report the checks that are failing.

    scikit-learn provides a pytest specific decorator,
    :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`, making
    it easier to test multiple estimators.

    Parameters
    ----------
    Estimator : estimator object
        Estimator to check. Estimator is a class object or instance.

        .. deprecated:: 0.23
           Passing a class is deprecated from version 0.23, and won't be
           supported in 0.24. Pass an instance instead.

    generate_only : bool, optional (default=False)
        When `False`, checks are evaluated when `check_estimator` is called.
        When `True`, `check_estimator` returns a generator that yields
        (estimator, check) tuples. The check is run by calling
        `check(estimator)`.

        .. versionadded:: 0.22

    Returns
    -------
    checks_generator : generator
        Generator that yields (estimator, check) tuples. Returned when
        `generate_only=True`.
    """
    # TODO: remove class support in 0.24 and update docstrings
    if isinstance(Estimator, type):
        # got a class
        msg = ("Passing a class is deprecated since version 0.23 "
               "and won't be supported in 0.24. "
               "Please pass an instance instead.")
        warnings.warn(msg, FutureWarning)

        checks_generator = _generate_class_checks(Estimator)
    else:
        # got an instance
        estimator = Estimator
        name = type(estimator).__name__
        checks_generator = _generate_instance_checks(name, estimator)

    if generate_only:
        return checks_generator

    for estimator, check in checks_generator:
        try:
            check(estimator)
        except SkipTest as exception:
            # the only SkipTest thrown currently results from not
            # being able to import pandas.
            warnings.warn(str(exception), SkipTestWarning)
def _boston_subset(n_samples=200):
    """Return a shuffled, standardized subset of the Boston dataset.

    The result is cached in the module-level ``BOSTON`` variable. The
    cache is reused only when it holds exactly ``n_samples`` rows, so a
    later call asking for a different size is not handed the subset built
    by an earlier call.

    Parameters
    ----------
    n_samples : int, default=200
        Number of leading rows to keep after shuffling.

    Returns
    -------
    X, y : tuple of arrays
        Features scaled with ``StandardScaler`` and the matching targets.
    """
    global BOSTON
    if BOSTON is None or BOSTON[0].shape[0] != n_samples:
        X, y = load_boston(return_X_y=True)
        X, y = shuffle(X, y, random_state=0)
        X, y = X[:n_samples], y[:n_samples]
        X = StandardScaler().fit_transform(X)
        BOSTON = X, y
    return BOSTON
@deprecated("set_checking_parameters is deprecated in version "
            "0.22 and will be removed in version 0.24.")
def set_checking_parameters(estimator):
    """Deprecated public alias that forwards to `_set_checking_parameters`."""
    _set_checking_parameters(estimator)
def _set_checking_parameters(estimator):
    """Adjust ``estimator`` in place so that the checks run quickly.

    Sets parameters to speed up some estimators and to avoid deprecated
    behaviour. Which values are overridden depends on the parameters the
    estimator exposes, its class name and its type.

    Parameters
    ----------
    estimator : estimator object
        Modified in place through ``set_params`` and attribute assignment.
    """
    params = estimator.get_params()
    name = estimator.__class__.__name__

    if name != "TSNE" and "n_iter" in params:
        estimator.set_params(n_iter=5)

    if "max_iter" in params:
        if estimator.max_iter is not None:
            estimator.set_params(max_iter=min(5, estimator.max_iter))
        # Some estimators get a fixed max_iter instead of the cap above.
        fixed_max_iter = {
            'LinearSVR': 20,
            'LinearSVC': 20,
            'NMF': 100,
            'MLPClassifier': 100,
            'MLPRegressor': 100,
        }
        if name in fixed_max_iter:
            estimator.set_params(max_iter=fixed_max_iter[name])

    if "n_resampling" in params:
        # randomized lasso
        estimator.set_params(n_resampling=5)

    if "n_estimators" in params:
        estimator.set_params(n_estimators=min(5, estimator.n_estimators))

    if "max_trials" in params:
        # RANSAC
        estimator.set_params(max_trials=10)

    if "n_init" in params:
        # K-Means
        estimator.set_params(n_init=2)

    if name == 'TruncatedSVD':
        # TruncatedSVD doesn't run with n_components = n_features
        # This is ugly :-/
        estimator.n_components = 1

    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = min(estimator.n_clusters, 2)

    if hasattr(estimator, "n_best"):
        estimator.n_best = 1

    if name == "SelectFdr":
        # be tolerant of noisy datasets (not actually speed)
        estimator.set_params(alpha=.5)

    if name == "TheilSenRegressor":
        estimator.max_subpopulation = 100

    if isinstance(estimator, BaseRandomProjection):
        # Due to the jl lemma and often very few samples, the number
        # of components of the random matrix projection will be probably
        # greater than the number of features.
        # So we impose a smaller number (avoid "auto" mode)
        estimator.set_params(n_components=2)

    if isinstance(estimator, SelectKBest):
        # SelectKBest has a default of k=10
        # which is more feature than we have in most case.
        estimator.set_params(k=1)

    hist_gradient_boosting = ('HistGradientBoostingClassifier',
                              'HistGradientBoostingRegressor')
    if name in hist_gradient_boosting:
        # The default min_samples_leaf (20) isn't appropriate for small
        # datasets (only very shallow trees are built) that the checks use.
        estimator.set_params(min_samples_leaf=5)

    # Speed-up by reducing the number of CV or splits for CV estimators
    keeps_default_cv = ['RidgeCV']
    if name not in keeps_default_cv and hasattr(estimator, 'cv'):
        estimator.set_params(cv=3)
    if hasattr(estimator, 'n_splits'):
        estimator.set_params(n_splits=3)

    if name == 'OneHotEncoder':
        estimator.set_params(handle_unknown='ignore')
- class _NotAnArray:
- """An object that is convertible to an array
- Parameters
- ----------
- data : array_like
- The data.
- """
- def __init__(self, data):
- self.data = np.asarray(data)
- def __array__(self, dtype=None):
- return self.data
- def __array_function__(self, func, types, args, kwargs):
- if func.__name__ == "may_share_memory":
- return True
- raise TypeError("Don't want to call array_function {}!".format(
- func.__name__))
@deprecated("NotAnArray is deprecated in version "
            "0.22 and will be removed in version 0.24.")
class NotAnArray(_NotAnArray):
    """Deprecated public subclass of `_NotAnArray`; adds nothing to it."""
    # TODO: remove in 0.24
- def _is_pairwise(estimator):
- """Returns True if estimator has a _pairwise attribute set to True.
- Parameters
- ----------
- estimator : object
- Estimator object to test.
- Returns
- -------
- out : bool
- True if _pairwise is set to True and False otherwise.
- """
- return bool(getattr(estimator, "_pairwise", False))
- def _is_pairwise_metric(estimator):
- """Returns True if estimator accepts pairwise metric.
- Parameters
- ----------
- estimator : object
- Estimator object to test.
- Returns
- -------
- out : bool
- True if _pairwise is set to True and False otherwise.
- """
- metric = getattr(estimator, "metric", None)
- return bool(metric == 'precomputed')
@deprecated("pairwise_estimator_convert_X is deprecated in version "
            "0.22 and will be removed in version 0.24.")
def pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel):
    """Deprecated public alias of `_pairwise_estimator_convert_X`."""
    converted = _pairwise_estimator_convert_X(X, estimator, kernel)
    return converted
def _pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel):
    """Convert ``X`` into the input form a pairwise estimator expects.

    Parameters
    ----------
    X : array-like
        Input data.
    estimator : object
        Estimator the data is meant for.
    kernel : callable, default=linear_kernel
        Called as ``kernel(X, X)`` for estimators flagged as pairwise.

    Returns
    -------
    X_converted
        Euclidean pairwise distances of ``X`` when the estimator's metric
        is ``'precomputed'``; otherwise ``kernel(X, X)`` when the
        estimator is pairwise; otherwise ``X`` itself, untouched.
    """
    if _is_pairwise_metric(estimator):
        result = pairwise_distances(X, metric='euclidean')
    elif _is_pairwise(estimator):
        result = kernel(X, X)
    else:
        result = X
    return result
- def _generate_sparse_matrix(X_csr):
- """Generate sparse matrices with {32,64}bit indices of diverse format
- Parameters
- ----------
- X_csr: CSR Matrix
- Input matrix in CSR format
- Returns
- -------
- out: iter(Matrices)
- In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo',
- 'coo_64', 'csc_64', 'csr_64']
- """
- assert X_csr.format == 'csr'
- yield 'csr', X_csr.copy()
- for sparse_format in ['dok', 'lil', 'dia', 'bsr', 'csc', 'coo']:
- yield sparse_format, X_csr.asformat(sparse_format)
- # Generate large indices matrix only if its supported by scipy
- X_coo = X_csr.asformat('coo')
- X_coo.row = X_coo.row.astype('int64')
- X_coo.col = X_coo.col.astype('int64')
- yield "coo_64", X_coo
- for sparse_format in ['csc', 'csr']:
- X = X_csr.asformat(sparse_format)
- X.indices = X.indices.astype('int64')
- X.indptr = X.indptr.astype('int64')
- yield sparse_format + "_64", X
def check_estimator_sparse_data(name, estimator_orig):
    """Check that the estimator handles sparse input or fails gracefully.

    A mostly-zero random matrix is fed to a fresh clone of the estimator
    in every format produced by ``_generate_sparse_matrix``. The shapes of
    ``predict`` and ``predict_proba`` outputs are asserted when those
    methods exist. A ``TypeError``/``ValueError`` whose repr mentions
    "sparse" counts as a graceful refusal; any other failure is reported
    and re-raised (as ``AssertionError`` for the 64-bit index formats).
    """
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10)
    X[X < .8] = 0
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    X_csr = sparse.csr_matrix(X)
    tags = estimator_orig._get_tags()
    # Two classes for binary-only estimators, four otherwise; the same
    # number is the expected width of predict_proba below.
    n_classes = 2 if tags['binary_only'] else 4
    # builtin `int` replaces the removed `np.int` alias
    y = (n_classes * rng.rand(40)).astype(int)
    # catch deprecation warnings
    with ignore_warnings(category=FutureWarning):
        estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)
    for matrix_format, X in _generate_sparse_matrix(X_csr):
        # catch deprecation warnings
        with ignore_warnings(category=FutureWarning):
            estimator = clone(estimator_orig)
            if name in ['Scaler', 'StandardScaler']:
                estimator.set_params(with_mean=False)
        # fit and predict
        try:
            with ignore_warnings(category=FutureWarning):
                estimator.fit(X, y)
            if hasattr(estimator, "predict"):
                pred = estimator.predict(X)
                if tags['multioutput_only']:
                    assert pred.shape == (X.shape[0], 1)
                else:
                    assert pred.shape == (X.shape[0],)
            if hasattr(estimator, 'predict_proba'):
                probs = estimator.predict_proba(X)
                expected_probs_shape = (X.shape[0], n_classes)
                assert probs.shape == expected_probs_shape
        except (TypeError, ValueError) as e:
            # An error mentioning "sparse" is an acceptable refusal.
            if 'sparse' not in repr(e).lower():
                if "64" in matrix_format:
                    msg = ("Estimator %s doesn't seem to support %s matrix, "
                           "and is not failing gracefully, e.g. by using "
                           "check_array(X, accept_large_sparse=False)")
                    raise AssertionError(msg % (name, matrix_format))
                else:
                    print("Estimator %s doesn't seem to fail gracefully on "
                          "sparse data: error message state explicitly that "
                          "sparse input is not supported if this is not"
                          " the case." % name)
                    raise
        except Exception:
            print("Estimator %s doesn't seem to fail gracefully on "
                  "sparse data: it should raise a TypeError if sparse input "
                  "is explicitly not supported." % name)
            raise
@ignore_warnings(category=FutureWarning)
def check_sample_weights_pandas_series(name, estimator_orig):
    """Check that ``fit`` accepts ``sample_weight`` as a pandas.Series.

    Estimators whose ``fit`` has no ``sample_weight`` parameter are not
    tested.

    Raises
    ------
    SkipTest
        If pandas cannot be imported.
    ValueError
        If ``fit`` raises a ``ValueError`` for the pandas input.
    """
    estimator = clone(estimator_orig)
    if not has_fit_parameter(estimator, "sample_weight"):
        return

    # Only the import itself is guarded, so that an ImportError raised
    # while fitting is not mistaken for a missing pandas.
    try:
        import pandas as pd
    except ImportError:
        raise SkipTest("pandas is not installed: not testing for "
                       "input of type pandas.Series to class weight.")

    X = np.array([[1, 1], [1, 2], [1, 3], [1, 4],
                  [2, 1], [2, 2], [2, 3], [2, 4],
                  [3, 1], [3, 2], [3, 3], [3, 4]])
    X = pd.DataFrame(_pairwise_estimator_convert_X(X, estimator_orig))
    y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])
    weights = pd.Series([1] * 12)
    if estimator._get_tags()["multioutput_only"]:
        y = pd.DataFrame(y)
    try:
        estimator.fit(X, y, sample_weight=weights)
    except ValueError as exc:
        raise ValueError("Estimator {0} raises error if "
                         "'sample_weight' parameter is of "
                         "type pandas.Series".format(name)) from exc
@ignore_warnings(category=(FutureWarning))
def check_sample_weights_not_an_array(name, estimator_orig):
    """Check that ``fit`` accepts ``sample_weight`` given as `_NotAnArray`.

    X, y and the weights are all wrapped in `_NotAnArray`. Estimators
    whose ``fit`` has no ``sample_weight`` parameter are not tested.
    """
    estimator = clone(estimator_orig)
    if has_fit_parameter(estimator, "sample_weight"):
        X = np.array([[1, 1], [1, 2], [1, 3], [1, 4],
                      [2, 1], [2, 2], [2, 3], [2, 4],
                      [3, 1], [3, 2], [3, 3], [3, 4]])
        # Use the private helper, like the sibling checks do, rather than
        # the deprecated public wrapper.
        X = _NotAnArray(_pairwise_estimator_convert_X(X, estimator_orig))
        y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])
        weights = _NotAnArray([1] * 12)
        if estimator._get_tags()["multioutput_only"]:
            y = _NotAnArray(y.data.reshape(-1, 1))
        estimator.fit(X, y, sample_weight=weights)
@ignore_warnings(category=(FutureWarning))
def check_sample_weights_list(name, estimator_orig):
    """Check that ``fit`` accepts ``sample_weight`` given as a list.

    Estimators whose ``fit`` has no ``sample_weight`` parameter are not
    tested. The check passes when ``fit`` raises no exception.
    """
    if not has_fit_parameter(estimator_orig, "sample_weight"):
        return

    estimator = clone(estimator_orig)
    rnd = np.random.RandomState(0)
    n_samples = 30
    raw_X = rnd.uniform(size=(n_samples, 3))
    X = _pairwise_estimator_convert_X(raw_X, estimator_orig)

    # Cyclic labels: two classes for binary-only estimators, else three.
    n_classes = 2 if estimator._get_tags()['binary_only'] else 3
    y = np.arange(n_samples) % n_classes
    y = _enforce_estimator_tags_y(estimator, y)

    weights_as_list = [3] * n_samples
    # Test that estimators don't raise any exception
    estimator.fit(X, y, sample_weight=weights_as_list)
@ignore_warnings(category=FutureWarning)
def check_sample_weights_shape(name, estimator_orig):
    """Check that a ``sample_weight`` of the wrong shape is rejected.

    ``fit`` must succeed with one weight per sample and raise a
    ``ValueError`` for weights that are twice as long or two-dimensional.
    Estimators without a ``sample_weight`` fit parameter, and pairwise
    estimators, are not tested.
    """
    if not has_fit_parameter(estimator_orig, "sample_weight"):
        return
    if _is_pairwise(estimator_orig):
        return

    estimator = clone(estimator_orig)
    # Four distinct points, each repeated four times in a row.
    X = np.repeat(np.array([[1, 3], [2, 1], [3, 3], [4, 1]]), 4, axis=0)
    y = np.tile(np.repeat(np.array([1, 2]), 4), 2)
    y = _enforce_estimator_tags_y(estimator, y)
    n_targets = len(y)

    estimator.fit(X, y, sample_weight=np.ones(n_targets))

    for bad_weights in (np.ones(2*n_targets), np.ones((n_targets, 2))):
        assert_raises(ValueError, estimator.fit, X, y,
                      sample_weight=bad_weights)
@ignore_warnings(category=FutureWarning)
def check_sample_weights_invariance(name, estimator_orig):
    """Check that unit sample weights give the same result as no weights.

    Two clones with the same random state are fitted, one with
    ``sample_weight`` of ones and one with ``sample_weight=None``; their
    ``predict`` and ``transform`` outputs (when available) must be close.
    Estimators without a ``sample_weight`` fit parameter are not tested,
    and pairwise estimators are skipped because the data is not pairwise.
    """
    if not has_fit_parameter(estimator_orig, "sample_weight"):
        return
    if _is_pairwise(estimator_orig):
        # We skip pairwise because the data is not pairwise
        return

    weighted = clone(estimator_orig)
    unweighted = clone(estimator_orig)
    set_random_state(weighted, random_state=0)
    set_random_state(unweighted, random_state=0)

    # Four distinct points, each repeated four times in a row.
    points = np.array([[1, 3], [2, 1], [3, 3], [4, 1]],
                      dtype=np.dtype('float'))
    X = np.repeat(points, 4, axis=0)
    y = np.tile(np.repeat(np.array([1, 2], dtype=np.dtype('int')), 4), 2)
    y = _enforce_estimator_tags_y(weighted, y)

    weighted.fit(X, y=y, sample_weight=np.ones(shape=len(y)))
    unweighted.fit(X, y=y, sample_weight=None)

    for method in ["predict", "transform"]:
        if not hasattr(estimator_orig, method):
            continue
        out_weighted = getattr(weighted, method)(X)
        out_unweighted = getattr(unweighted, method)(X)
        if sparse.issparse(out_weighted):
            out_weighted = out_weighted.toarray()
            out_unweighted = out_unweighted.toarray()
        assert_allclose(out_weighted, out_unweighted,
                        err_msg="For %s sample_weight=None is not"
                                " equivalent to sample_weight=ones"
                                % name)
@ignore_warnings(category=(FutureWarning, UserWarning))
def check_dtype_object(name, estimator_orig):
    """Check that estimators treat dtype object as numeric if possible.

    The estimator is fitted (and asked to predict/transform when it can)
    on numeric data stored in an object array. Fitting with an object
    target is tolerated only if it succeeds or fails with
    "Unknown label type". Finally a dict is placed in X: estimators
    without 'string' in their ``X_types`` tag must raise a ``TypeError``,
    the others must still fit.
    """
    rng = np.random.RandomState(0)
    X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig)
    X = X.astype(object)
    tags = estimator_orig._get_tags()
    # builtin `int` replaces the removed `np.int` alias
    if tags['binary_only']:
        y = (X[:, 0] * 2).astype(int)
    else:
        y = (X[:, 0] * 4).astype(int)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    estimator.fit(X, y)
    if hasattr(estimator, "predict"):
        estimator.predict(X)

    if hasattr(estimator, "transform"):
        estimator.transform(X)

    try:
        estimator.fit(X, y.astype(object))
    except Exception as e:
        if "Unknown label type" not in str(e):
            raise

    if 'string' not in tags['X_types']:
        X[0, 0] = {'foo': 'bar'}
        msg = "argument must be a string.* number"
        assert_raises_regex(TypeError, msg, estimator.fit, X, y)
    else:
        # Estimators supporting string will not call np.asarray to convert the
        # data to numeric and therefore, the error will not be raised.
        # Checking for each element dtype in the input array will be costly.
        # Refer to #11401 for full discussion.
        estimator.fit(X, y)
def check_complex_data(name, estimator_orig):
    """Check that estimators raise an exception on providing complex data.

    ``fit`` on a complex-valued X and y must raise a ``ValueError``
    matching "Complex data not supported".
    """
    complex_X = np.random.sample(10) + 1j * np.random.sample(10)
    complex_X = complex_X.reshape(-1, 1)
    complex_y = np.random.sample(10) + 1j * np.random.sample(10)

    model = clone(estimator_orig)
    assert_raises_regex(ValueError, "Complex data not supported",
                        model.fit, complex_X, complex_y)
@ignore_warnings
def check_dict_unchanged(name, estimator_orig):
    """Check that prediction-time methods leave ``__dict__`` unchanged.

    After fitting, each of ``predict``, ``transform``,
    ``decision_function`` and ``predict_proba`` that the estimator
    exposes is called, and the estimator's ``__dict__`` must compare
    equal before and after the call.
    """
    # this estimator raises
    # ValueError: Found array with 0 feature(s) (shape=(23, 0))
    # while a minimum of 1 is required.
    # error
    if name in ['SpectralCoclustering']:
        return
    rnd = np.random.RandomState(0)
    if name in ['RANSACRegressor']:
        X = 3 * rnd.uniform(size=(20, 3))
    else:
        X = 2 * rnd.uniform(size=(20, 3))

    X = _pairwise_estimator_convert_X(X, estimator_orig)

    # builtin `int` replaces the removed `np.int` alias
    y = X[:, 0].astype(int)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)
    if hasattr(estimator, "n_components"):
        estimator.n_components = 1

    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    if hasattr(estimator, "n_best"):
        estimator.n_best = 1

    set_random_state(estimator, 1)

    estimator.fit(X, y)
    for method in ["predict", "transform", "decision_function",
                   "predict_proba"]:
        if hasattr(estimator, method):
            # Shallow snapshot of the attributes before the call.
            dict_before = estimator.__dict__.copy()
            getattr(estimator, method)(X)
            assert estimator.__dict__ == dict_before, (
                'Estimator changes __dict__ during %s' % method)
@deprecated("is_public_parameter is deprecated in version "
            "0.22 and will be removed in version 0.24.")
def is_public_parameter(attr):
    # Deprecated public name: forwards unchanged to the private helper
    # _is_public_parameter and returns its result.
    return _is_public_parameter(attr)
def _is_public_parameter(attr):
    """Return True when `attr` neither starts nor ends with an underscore."""
    if attr.startswith('_'):
        return False
    return not attr.endswith('_')
@ignore_warnings(category=FutureWarning)
def check_dont_overwrite_parameters(name, estimator_orig):
    """Check that ``fit`` only adds or changes private attributes.

    An attribute counts as public when its name neither starts nor ends
    with an underscore (see ``_is_public_parameter``). After fitting a
    clone of ``estimator_orig``, no public attribute may have been added
    to ``__dict__`` and no public attribute may be bound to a different
    object than before the fit.

    Parameters
    ----------
    name : str
        Name of the estimator (not used by this check).
    estimator_orig : estimator instance
        Estimator to check; it is cloned, never fitted in place.
    """
    if hasattr(estimator_orig.__init__, "deprecated_original"):
        # to not check deprecated classes
        return

    estimator = clone(estimator_orig)
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    y = X[:, 0].astype(int)
    if estimator._get_tags()['binary_only']:
        # Merge label 2 into label 1 for binary-only estimators.
        y[y == 2] = 1
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    # Shallow snapshot: values are compared by identity further down.
    dict_before_fit = estimator.__dict__.copy()
    estimator.fit(X, y)
    dict_after_fit = estimator.__dict__

    public_keys_after_fit = [key for key in dict_after_fit
                             if _is_public_parameter(key)]

    attrs_added_by_fit = [key for key in public_keys_after_fit
                          if key not in dict_before_fit]

    # check that fit doesn't add any public attribute
    assert not attrs_added_by_fit, (
            'Estimator adds public attribute(s) during'
            ' the fit method.'
            ' Estimators are only allowed to add private attributes'
            ' either started with _ or ended'
            ' with _ but %s added'
            % ', '.join(attrs_added_by_fit))

    # check that fit doesn't change any public attribute
    # (every remaining public key also exists in the pre-fit snapshot,
    # otherwise the assertion above would have failed)
    attrs_changed_by_fit = [key for key in public_keys_after_fit
                            if (dict_before_fit[key]
                                is not dict_after_fit[key])]

    assert not attrs_changed_by_fit, (
            'Estimator changes public attribute(s) during'
            ' the fit method. Estimators are only allowed'
            ' to change attributes started'
            ' or ended with _, but'
            ' %s changed'
            % ', '.join(attrs_changed_by_fit))
@ignore_warnings(category=FutureWarning)
def check_fit2d_predict1d(name, estimator_orig):
    """Check that 1d input is rejected after fitting on 2d data.

    A clone of ``estimator_orig`` is fitted on a (20, 3) array (before any
    pairwise conversion). Each of ``predict``, ``transform``,
    ``decision_function`` and ``predict_proba`` that exists is then called
    with the single row ``X[0]`` and must raise a ``ValueError`` whose
    message contains "Reshape your data". Estimators tagged
    ``no_validation`` are fitted but not checked further.

    Parameters
    ----------
    name : str
        Name of the estimator (not used by this check).
    estimator_orig : estimator instance
        Estimator to check; it is cloned, never fitted in place.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    y = X[:, 0].astype(int)
    tags = estimator_orig._get_tags()
    if tags['binary_only']:
        # Merge label 2 into label 1 for binary-only estimators.
        y[y == 2] = 1
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    estimator.fit(X, y)
    if tags["no_validation"]:
        # FIXME this is a bit loose
        return

    for method in ["predict", "transform", "decision_function",
                   "predict_proba"]:
        if hasattr(estimator, method):
            assert_raise_message(ValueError, "Reshape your data",
                                 getattr(estimator, method), X[0])
def _apply_on_subsets(func, X):
    """Apply `func` to all of X at once and to each row of X separately.

    Parameters
    ----------
    func : callable
        Called with a 2d array. It may return an array-like, a scipy
        sparse container, or a tuple whose first item is one of those.
    X : ndarray of shape (n_samples, n_features)
        Input data; each row is reshaped to (1, n_features) before being
        passed to `func` on its own.

    Returns
    -------
    result_full : ndarray
        Flattened output of ``func(X)``.
    result_by_batch : ndarray
        Flattened outputs of `func` on the individual rows, collected in
        row order.
    """
    # apply function on the whole set and on mini batches
    result_full = func(X)
    n_features = X.shape[1]
    result_by_batch = [func(batch.reshape(1, n_features))
                       for batch in X]

    # func can output tuple (e.g. score_samples); isinstance also accepts
    # tuple subclasses such as namedtuples. Only the first item is kept.
    if isinstance(result_full, tuple):
        result_full = result_full[0]
        result_by_batch = [res[0] for res in result_by_batch]

    # Densify sparse outputs; ``toarray()`` is the explicit spelling of
    # the ``.A`` shorthand.
    if sparse.issparse(result_full):
        result_full = result_full.toarray()
        result_by_batch = [res.toarray() for res in result_by_batch]

    return np.ravel(result_full), np.ravel(result_by_batch)
@ignore_warnings(category=FutureWarning)
def check_methods_subset_invariance(name, estimator_orig):
    """Check that methods give the same output on X and on its rows.

    After fitting a clone of ``estimator_orig``, each of ``predict``,
    ``transform``, ``decision_function``, ``score_samples`` and
    ``predict_proba`` that exists is applied to the whole of ``X`` and to
    every row individually (via ``_apply_on_subsets``); the flattened
    results must agree within ``atol=1e-7``.

    Parameters
    ----------
    name : str
        Name of the estimator; used in the failure message.
    estimator_orig : estimator instance
        Estimator to check; it is cloned, never fitted in place.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    y = X[:, 0].astype(int)
    if estimator_orig._get_tags()['binary_only']:
        # Merge label 2 into label 1 for binary-only estimators.
        y[y == 2] = 1
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    estimator.fit(X, y)

    for method in ["predict", "transform", "decision_function",
                   "score_samples", "predict_proba"]:
        if not hasattr(estimator, method):
            continue

        msg = ("{method} of {name} is not invariant when applied "
               "to a subset.").format(method=method, name=name)
        result_full, result_by_batch = _apply_on_subsets(
            getattr(estimator, method), X)
        assert_allclose(result_full, result_by_batch,
                        atol=1e-7, err_msg=msg)
@ignore_warnings
def check_fit2d_1sample(name, estimator_orig):
    """Check fitting on a single sample works or fails informatively.

    Fitting a 2d array with only one sample must either succeed or raise
    a ``ValueError`` whose ``repr`` mentions the number of samples or the
    number of classes (one of the substrings in ``msgs`` below). Any other
    ``ValueError`` is re-raised.

    Parameters
    ----------
    name : str
        Name of the estimator; selects the OPTICS special case.
    estimator_orig : estimator instance
        Estimator to check; it is cloned, never fitted in place.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(1, 10))
    X = _pairwise_estimator_convert_X(X, estimator_orig)

    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    y = X[:, 0].astype(int)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)

    # min_cluster_size cannot be less than the data size for OPTICS.
    if name == 'OPTICS':
        estimator.set_params(min_samples=1)

    msgs = ["1 sample", "n_samples = 1", "n_samples=1", "one sample",
            "1 class", "one class"]

    try:
        estimator.fit(X, y)
    except ValueError as e:
        error_repr = repr(e)
        if not any(msg in error_repr for msg in msgs):
            # Uninformative error: propagate it unchanged.
            raise
@ignore_warnings
def check_fit2d_1feature(name, estimator_orig):
    """Check fitting on a single feature works or fails informatively.

    Fitting a 2d array with only one feature must either succeed or raise
    a ``ValueError`` whose ``repr`` contains one of the substrings in
    ``msgs`` below. Any other ``ValueError`` is re-raised.

    Parameters
    ----------
    name : str
        Name of the estimator; selects the special cases below.
    estimator_orig : estimator instance
        Estimator to check; it is cloned, never fitted in place.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(10, 1))
    X = _pairwise_estimator_convert_X(X, estimator_orig)
    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    y = X[:, 0].astype(int)
    estimator = clone(estimator_orig)
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    # ensure two labels in subsample for RandomizedLogisticRegression
    if name == 'RandomizedLogisticRegression':
        estimator.sample_fraction = 1
    # ensure non skipped trials for RANSACRegressor
    if name == 'RANSACRegressor':
        estimator.residual_threshold = 0.5

    # NOTE(review): tags were already enforced on y above; this second
    # call is kept as-is -- confirm whether it is still needed.
    y = _enforce_estimator_tags_y(estimator, y)
    set_random_state(estimator, 1)

    msgs = ["1 feature(s)", "n_features = 1", "n_features=1"]

    try:
        estimator.fit(X, y)
    except ValueError as e:
        error_repr = repr(e)
        if not any(msg in error_repr for msg in msgs):
            # Uninformative error: propagate it unchanged.
            raise
@ignore_warnings
def check_fit1d(name, estimator_orig):
    """Check that fitting on a 1d ``X`` raises a ValueError.

    Estimators tagged ``no_validation`` are skipped.

    Parameters
    ----------
    name : str
        Name of the estimator (not used by this check).
    estimator_orig : estimator instance
        Estimator to check; it is cloned, never fitted in place.
    """
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=20)
    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    y = X.astype(int)
    estimator = clone(estimator_orig)
    tags = estimator._get_tags()
    if tags["no_validation"]:
        # FIXME this is a bit loose
        return
    y = _enforce_estimator_tags_y(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    assert_raises(ValueError, estimator.fit, X, y)
@ignore_warnings(category=FutureWarning)
def check_transformer_general(name, transformer, readonly_memmap=False):
    """Run ``_check_transformer`` on standardized, shifted blob data.

    When ``readonly_memmap`` is true, the data and labels are first
    replaced by the output of ``create_memmap_backed_data``.
    """
    blob_centers = [[0, 0, 0], [1, 1, 1]]
    data, labels = make_blobs(n_samples=30, centers=blob_centers,
                              random_state=0, n_features=2,
                              cluster_std=0.1)
    data = StandardScaler().fit_transform(data)
    # Shift in place so that the smallest entry becomes zero.
    data -= data.min()
    data = _pairwise_estimator_convert_X(data, transformer)

    if readonly_memmap:
        data, labels = create_memmap_backed_data([data, labels])

    _check_transformer(name, transformer, data, labels)
@ignore_warnings(category=FutureWarning)
def check_transformer_data_not_an_array(name, transformer):
    """Run ``_check_transformer`` on non-ndarray inputs.

    The same blob data is passed twice: once wrapped in ``_NotAnArray``
    and once converted to nested Python lists.
    """
    blob_centers = [[0, 0, 0], [1, 1, 1]]
    data, labels = make_blobs(n_samples=30, centers=blob_centers,
                              random_state=0, n_features=2,
                              cluster_std=0.1)
    data = StandardScaler().fit_transform(data)
    # We need to make sure that we have non negative data, for things
    # like NMF
    offset = data.min() - .1
    data -= offset
    data = _pairwise_estimator_convert_X(data, transformer)

    wrapped_data = _NotAnArray(data)
    wrapped_labels = _NotAnArray(np.asarray(labels))
    _check_transformer(name, transformer, wrapped_data, wrapped_labels)

    # try the same with some list
    _check_transformer(name, transformer, data.tolist(), labels.tolist())
@ignore_warnings(category=FutureWarning)
def check_transformers_unfitted(name, transformer):
    """Check that ``transform`` on an unfitted clone raises an error.

    Either ``AttributeError`` or ``ValueError`` is accepted.
    """
    features, _ = _boston_subset()
    unfitted = clone(transformer)

    failure_msg = ("The unfitted "
                   "transformer {} does not raise an error when "
                   "transform is called. Perhaps use "
                   "check_is_fitted in transform.".format(name))
    with assert_raises((AttributeError, ValueError), msg=failure_msg):
        unfitted.transform(features)
def _check_transformer(name, transformer_orig, X, y):
    # Shared body of the transformer checks: fits a clone of
    # `transformer_orig` on (X, y) and verifies that fit_transform,
    # transform and a second fit_transform return consistent results with
    # the expected number of samples, and that transform rejects input
    # with one feature column fewer than was seen in fit.
    #
    # name: estimator name; selects the CROSS_DECOMPOSITION special case
    #     and is used in messages.
    # transformer_orig: transformer to check (cloned, not fitted in place).
    # X, y: data and targets; X is converted with np.asarray only to read
    #     its shape.

    # NOTE: n_features is unpacked here but not used below.
    n_samples, n_features = np.asarray(X).shape
    transformer = clone(transformer_orig)
    set_random_state(transformer)

    # fit

    if name in CROSS_DECOMPOSITION:
        # Build a two-column target from y, doubling every other row of
        # the second column.
        y_ = np.c_[np.asarray(y), np.asarray(y)]
        y_[::2, 1] *= 2
        if isinstance(X, _NotAnArray):
            # Keep the target wrapped the same way as X.
            y_ = _NotAnArray(y_)
    else:
        y_ = y

    transformer.fit(X, y_)
    # fit_transform method should work on non fitted estimator
    transformer_clone = clone(transformer)
    X_pred = transformer_clone.fit_transform(X, y=y_)

    if isinstance(X_pred, tuple):
        # Tuple output: every element must have n_samples rows.
        for x_pred in X_pred:
            assert x_pred.shape[0] == n_samples
    else:
        # check for consistent n_samples
        assert X_pred.shape[0] == n_samples

    if hasattr(transformer, 'transform'):
        if name in CROSS_DECOMPOSITION:
            # For these estimators transform is also given the target.
            X_pred2 = transformer.transform(X, y_)
            X_pred3 = transformer.fit_transform(X, y=y_)
        else:
            X_pred2 = transformer.transform(X)
            X_pred3 = transformer.fit_transform(X, y=y_)

        # The skip happens only after the calls above have completed, so
        # non-deterministic transformers still get exercised.
        if transformer_orig._get_tags()['non_deterministic']:
            msg = name + ' is non deterministic'
            raise SkipTest(msg)
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            # Compare tuple outputs element by element.
            for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                assert_allclose_dense_sparse(
                    x_pred, x_pred2, atol=1e-2,
                    err_msg="fit_transform and transform outcomes "
                            "not consistent in %s"
                    % transformer)
                assert_allclose_dense_sparse(
                    x_pred, x_pred3, atol=1e-2,
                    err_msg="consecutive fit_transform outcomes "
                            "not consistent in %s"
                    % transformer)
        else:
            assert_allclose_dense_sparse(
                X_pred, X_pred2,
                err_msg="fit_transform and transform outcomes "
                        "not consistent in %s"
                % transformer, atol=1e-2)
            assert_allclose_dense_sparse(
                X_pred, X_pred3, atol=1e-2,
                err_msg="consecutive fit_transform outcomes "
                        "not consistent in %s"
                % transformer)
            assert _num_samples(X_pred2) == n_samples
            assert _num_samples(X_pred3) == n_samples

        # raises error on malformed input for transform
        if hasattr(X, 'shape') and \
           not transformer._get_tags()["stateless"] and \
           X.ndim == 2 and X.shape[1] > 1:

            # If it's not an array, it does not have a 'T' property
            with assert_raises(ValueError, msg="The transformer {} does "
                               "not raise an error when the number of "
                               "features in transform is different from"
                               " the number of features in "
                               "fit.".format(name)):
                # Drop the last feature column to create the mismatch.
                transformer.transform(X[:, :-1])
- @ignore_warnings
- def check_pipeline_consistency(name, estimator_orig):
- if estimator_orig._get_tags()['non_deterministic']:
- msg = name + ' is non deterministic'
- raise SkipTest(msg)
- # check that make_pipeline(est) gives same score as est
- X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
- random_state=0, n_features=2, cluster_std=0.1)
- X -= X.min()
- X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
- estimat…
Large files files are truncated, but you can click here to view the full file