PageRenderTime 32ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/sklearn/utils/multiclass.py

https://gitlab.com/0072016/0072016
Python | 388 lines | 317 code | 28 blank | 43 comment | 28 complexity | 45f569b7886575882f2cf5dd7316449a MD5 | raw file
  1. # Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
  2. #
  3. # License: BSD 3 clause
  4. """
  5. Multi-class / multi-label utility function
  6. ==========================================
  7. """
  8. from __future__ import division
  9. from collections import Sequence
  10. from itertools import chain
  11. from scipy.sparse import issparse
  12. from scipy.sparse.base import spmatrix
  13. from scipy.sparse import dok_matrix
  14. from scipy.sparse import lil_matrix
  15. import numpy as np
  16. from ..externals.six import string_types
  17. from .validation import check_array
  18. from ..utils.fixes import bincount
  19. from ..utils.fixes import array_equal
  20. def _unique_multiclass(y):
  21. if hasattr(y, '__array__'):
  22. return np.unique(np.asarray(y))
  23. else:
  24. return set(y)
  25. def _unique_indicator(y):
  26. return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1])
  27. _FN_UNIQUE_LABELS = {
  28. 'binary': _unique_multiclass,
  29. 'multiclass': _unique_multiclass,
  30. 'multilabel-indicator': _unique_indicator,
  31. }
  32. def unique_labels(*ys):
  33. """Extract an ordered array of unique labels
  34. We don't allow:
  35. - mix of multilabel and multiclass (single label) targets
  36. - mix of label indicator matrix and anything else,
  37. because there are no explicit labels)
  38. - mix of label indicator matrices of different sizes
  39. - mix of string and integer labels
  40. At the moment, we also don't allow "multiclass-multioutput" input type.
  41. Parameters
  42. ----------
  43. *ys : array-likes,
  44. Returns
  45. -------
  46. out : numpy array of shape [n_unique_labels]
  47. An ordered array of unique labels.
  48. Examples
  49. --------
  50. >>> from sklearn.utils.multiclass import unique_labels
  51. >>> unique_labels([3, 5, 5, 5, 7, 7])
  52. array([3, 5, 7])
  53. >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
  54. array([1, 2, 3, 4])
  55. >>> unique_labels([1, 2, 10], [5, 11])
  56. array([ 1, 2, 5, 10, 11])
  57. """
  58. if not ys:
  59. raise ValueError('No argument has been passed.')
  60. # Check that we don't mix label format
  61. ys_types = set(type_of_target(x) for x in ys)
  62. if ys_types == set(["binary", "multiclass"]):
  63. ys_types = set(["multiclass"])
  64. if len(ys_types) > 1:
  65. raise ValueError("Mix type of y not allowed, got types %s" % ys_types)
  66. label_type = ys_types.pop()
  67. # Check consistency for the indicator format
  68. if (label_type == "multilabel-indicator" and
  69. len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1]
  70. for y in ys)) > 1):
  71. raise ValueError("Multi-label binary indicator input with "
  72. "different numbers of labels")
  73. # Get the unique set of labels
  74. _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
  75. if not _unique_labels:
  76. raise ValueError("Unknown label type: %s" % repr(ys))
  77. ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys))
  78. # Check that we don't mix string type with number type
  79. if (len(set(isinstance(label, string_types) for label in ys_labels)) > 1):
  80. raise ValueError("Mix of label input types (string and number)")
  81. return np.array(sorted(ys_labels))
  82. def _is_integral_float(y):
  83. return y.dtype.kind == 'f' and np.all(y.astype(int) == y)
  84. def is_multilabel(y):
  85. """ Check if ``y`` is in a multilabel format.
  86. Parameters
  87. ----------
  88. y : numpy array of shape [n_samples]
  89. Target values.
  90. Returns
  91. -------
  92. out : bool,
  93. Return ``True``, if ``y`` is in a multilabel format, else ```False``.
  94. Examples
  95. --------
  96. >>> import numpy as np
  97. >>> from sklearn.utils.multiclass import is_multilabel
  98. >>> is_multilabel([0, 1, 0, 1])
  99. False
  100. >>> is_multilabel([[1], [0, 2], []])
  101. False
  102. >>> is_multilabel(np.array([[1, 0], [0, 0]]))
  103. True
  104. >>> is_multilabel(np.array([[1], [0], [0]]))
  105. False
  106. >>> is_multilabel(np.array([[1, 0, 0]]))
  107. True
  108. """
  109. if hasattr(y, '__array__'):
  110. y = np.asarray(y)
  111. if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
  112. return False
  113. if issparse(y):
  114. if isinstance(y, (dok_matrix, lil_matrix)):
  115. y = y.tocsr()
  116. return (len(y.data) == 0 or np.unique(y.data).size == 1 and
  117. (y.dtype.kind in 'biu' or # bool, int, uint
  118. _is_integral_float(np.unique(y.data))))
  119. else:
  120. labels = np.unique(y)
  121. return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint
  122. _is_integral_float(labels))
  123. def check_classification_targets(y):
  124. """Ensure that target y is of a non-regression type.
  125. Only the following target types (as defined in type_of_target) are allowed:
  126. 'binary', 'multiclass', 'multiclass-multioutput',
  127. 'multilabel-indicator', 'multilabel-sequences'
  128. Parameters
  129. ----------
  130. y : array-like
  131. """
  132. y_type = type_of_target(y)
  133. if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
  134. 'multilabel-indicator', 'multilabel-sequences']:
  135. raise ValueError("Unknown label type: %r" % y_type)
  136. def type_of_target(y):
  137. """Determine the type of data indicated by target `y`
  138. Parameters
  139. ----------
  140. y : array-like
  141. Returns
  142. -------
  143. target_type : string
  144. One of:
  145. * 'continuous': `y` is an array-like of floats that are not all
  146. integers, and is 1d or a column vector.
  147. * 'continuous-multioutput': `y` is a 2d array of floats that are
  148. not all integers, and both dimensions are of size > 1.
  149. * 'binary': `y` contains <= 2 discrete values and is 1d or a column
  150. vector.
  151. * 'multiclass': `y` contains more than two discrete values, is not a
  152. sequence of sequences, and is 1d or a column vector.
  153. * 'multiclass-multioutput': `y` is a 2d array that contains more
  154. than two discrete values, is not a sequence of sequences, and both
  155. dimensions are of size > 1.
  156. * 'multilabel-indicator': `y` is a label indicator matrix, an array
  157. of two dimensions with at least two columns, and at most 2 unique
  158. values.
  159. * 'unknown': `y` is array-like but none of the above, such as a 3d
  160. array, sequence of sequences, or an array of non-sequence objects.
  161. Examples
  162. --------
  163. >>> import numpy as np
  164. >>> type_of_target([0.1, 0.6])
  165. 'continuous'
  166. >>> type_of_target([1, -1, -1, 1])
  167. 'binary'
  168. >>> type_of_target(['a', 'b', 'a'])
  169. 'binary'
  170. >>> type_of_target([1.0, 2.0])
  171. 'binary'
  172. >>> type_of_target([1, 0, 2])
  173. 'multiclass'
  174. >>> type_of_target([1.0, 0.0, 3.0])
  175. 'multiclass'
  176. >>> type_of_target(['a', 'b', 'c'])
  177. 'multiclass'
  178. >>> type_of_target(np.array([[1, 2], [3, 1]]))
  179. 'multiclass-multioutput'
  180. >>> type_of_target([[1, 2]])
  181. 'multiclass-multioutput'
  182. >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
  183. 'continuous-multioutput'
  184. >>> type_of_target(np.array([[0, 1], [1, 1]]))
  185. 'multilabel-indicator'
  186. """
  187. valid = ((isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__'))
  188. and not isinstance(y, string_types))
  189. if not valid:
  190. raise ValueError('Expected array-like (array or non-string sequence), '
  191. 'got %r' % y)
  192. if is_multilabel(y):
  193. return 'multilabel-indicator'
  194. try:
  195. y = np.asarray(y)
  196. except ValueError:
  197. # Known to fail in numpy 1.3 for array of arrays
  198. return 'unknown'
  199. # The old sequence of sequences format
  200. try:
  201. if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence)
  202. and not isinstance(y[0], string_types)):
  203. raise ValueError('You appear to be using a legacy multi-label data'
  204. ' representation. Sequence of sequences are no'
  205. ' longer supported; use a binary array or sparse'
  206. ' matrix instead.')
  207. except IndexError:
  208. pass
  209. # Invalid inputs
  210. if y.ndim > 2 or (y.dtype == object and len(y) and
  211. not isinstance(y.flat[0], string_types)):
  212. return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"]
  213. if y.ndim == 2 and y.shape[1] == 0:
  214. return 'unknown' # [[]]
  215. if y.ndim == 2 and y.shape[1] > 1:
  216. suffix = "-multioutput" # [[1, 2], [1, 2]]
  217. else:
  218. suffix = "" # [1, 2, 3] or [[1], [2], [3]]
  219. # check float and contains non-integer float values
  220. if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
  221. # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
  222. return 'continuous' + suffix
  223. if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
  224. return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
  225. else:
  226. return 'binary' # [1, 2] or [["a"], ["b"]]
  227. def _check_partial_fit_first_call(clf, classes=None):
  228. """Private helper function for factorizing common classes param logic
  229. Estimators that implement the ``partial_fit`` API need to be provided with
  230. the list of possible classes at the first call to partial_fit.
  231. Subsequent calls to partial_fit should check that ``classes`` is still
  232. consistent with a previous value of ``clf.classes_`` when provided.
  233. This function returns True if it detects that this was the first call to
  234. ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
  235. set on ``clf``.
  236. """
  237. if getattr(clf, 'classes_', None) is None and classes is None:
  238. raise ValueError("classes must be passed on the first call "
  239. "to partial_fit.")
  240. elif classes is not None:
  241. if getattr(clf, 'classes_', None) is not None:
  242. if not array_equal(clf.classes_, unique_labels(classes)):
  243. raise ValueError(
  244. "`classes=%r` is not the same as on last call "
  245. "to partial_fit, was: %r" % (classes, clf.classes_))
  246. else:
  247. # This is the first call to partial_fit
  248. clf.classes_ = unique_labels(classes)
  249. return True
  250. # classes is None and clf.classes_ has already previously been set:
  251. # nothing to do
  252. return False
  253. def class_distribution(y, sample_weight=None):
  254. """Compute class priors from multioutput-multiclass target data
  255. Parameters
  256. ----------
  257. y : array like or sparse matrix of size (n_samples, n_outputs)
  258. The labels for each example.
  259. sample_weight : array-like of shape = (n_samples,), optional
  260. Sample weights.
  261. Returns
  262. -------
  263. classes : list of size n_outputs of arrays of size (n_classes,)
  264. List of classes for each column.
  265. n_classes : list of integers of size n_outputs
  266. Number of classes in each column
  267. class_prior : list of size n_outputs of arrays of size (n_classes,)
  268. Class distribution of each column.
  269. """
  270. classes = []
  271. n_classes = []
  272. class_prior = []
  273. n_samples, n_outputs = y.shape
  274. if issparse(y):
  275. y = y.tocsc()
  276. y_nnz = np.diff(y.indptr)
  277. for k in range(n_outputs):
  278. col_nonzero = y.indices[y.indptr[k]:y.indptr[k + 1]]
  279. # separate sample weights for zero and non-zero elements
  280. if sample_weight is not None:
  281. nz_samp_weight = np.asarray(sample_weight)[col_nonzero]
  282. zeros_samp_weight_sum = (np.sum(sample_weight) -
  283. np.sum(nz_samp_weight))
  284. else:
  285. nz_samp_weight = None
  286. zeros_samp_weight_sum = y.shape[0] - y_nnz[k]
  287. classes_k, y_k = np.unique(y.data[y.indptr[k]:y.indptr[k + 1]],
  288. return_inverse=True)
  289. class_prior_k = bincount(y_k, weights=nz_samp_weight)
  290. # An explicit zero was found, combine its weight with the weight
  291. # of the implicit zeros
  292. if 0 in classes_k:
  293. class_prior_k[classes_k == 0] += zeros_samp_weight_sum
  294. # If an there is an implicit zero and it is not in classes and
  295. # class_prior, make an entry for it
  296. if 0 not in classes_k and y_nnz[k] < y.shape[0]:
  297. classes_k = np.insert(classes_k, 0, 0)
  298. class_prior_k = np.insert(class_prior_k, 0,
  299. zeros_samp_weight_sum)
  300. classes.append(classes_k)
  301. n_classes.append(classes_k.shape[0])
  302. class_prior.append(class_prior_k / class_prior_k.sum())
  303. else:
  304. for k in range(n_outputs):
  305. classes_k, y_k = np.unique(y[:, k], return_inverse=True)
  306. classes.append(classes_k)
  307. n_classes.append(classes_k.shape[0])
  308. class_prior_k = bincount(y_k, weights=sample_weight)
  309. class_prior.append(class_prior_k / class_prior_k.sum())
  310. return (classes, n_classes, class_prior)