PageRenderTime 57ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/sklearn/utils/__init__.py

https://github.com/mblondel/scikit-learn
Python | 475 lines | 446 code | 8 blank | 21 comment | 0 complexity | a36a0cf08a755be10d24352fc4efd0d9 MD5 | raw file
  1. """
  2. The :mod:`sklearn.utils` module includes various utilities.
  3. """
  4. from collections import Sequence
  5. import numpy as np
  6. from scipy.sparse import issparse
  7. import warnings
  8. from .murmurhash import murmurhash3_32
  9. from .validation import (as_float_array,
  10. assert_all_finite, warn_if_not_float,
  11. check_random_state, column_or_1d, check_array,
  12. check_consistent_length, check_X_y, indexable,
  13. check_symmetric)
  14. from .class_weight import compute_class_weight, compute_sample_weight
  15. from ..externals.joblib import cpu_count
# Names exported by ``from sklearn.utils import *``.
# NOTE(review): apart from ``safe_indexing``, helpers defined in this module
# itself (e.g. ``resample``, ``shuffle``, ``deprecated``) are not listed here
# -- confirm whether that omission is intentional.
__all__ = ["murmurhash3_32", "as_float_array",
           "assert_all_finite", "check_array",
           "warn_if_not_float",
           "check_random_state",
           "compute_class_weight", "compute_sample_weight",
           "column_or_1d", "safe_indexing",
           "check_consistent_length", "check_X_y", 'indexable',
           "check_symmetric"]
  24. class deprecated(object):
  25. """Decorator to mark a function or class as deprecated.
  26. Issue a warning when the function is called/the class is instantiated and
  27. adds a warning to the docstring.
  28. The optional extra argument will be appended to the deprecation message
  29. and the docstring. Note: to use this with the default value for extra, put
  30. in an empty of parentheses:
  31. >>> from sklearn.utils import deprecated
  32. >>> deprecated() # doctest: +ELLIPSIS
  33. <sklearn.utils.deprecated object at ...>
  34. >>> @deprecated()
  35. ... def some_function(): pass
  36. """
  37. # Adapted from http://wiki.python.org/moin/PythonDecoratorLibrary,
  38. # but with many changes.
  39. def __init__(self, extra=''):
  40. """
  41. Parameters
  42. ----------
  43. extra: string
  44. to be added to the deprecation messages
  45. """
  46. self.extra = extra
  47. def __call__(self, obj):
  48. if isinstance(obj, type):
  49. return self._decorate_class(obj)
  50. else:
  51. return self._decorate_fun(obj)
  52. def _decorate_class(self, cls):
  53. msg = "Class %s is deprecated" % cls.__name__
  54. if self.extra:
  55. msg += "; %s" % self.extra
  56. # FIXME: we should probably reset __new__ for full generality
  57. init = cls.__init__
  58. def wrapped(*args, **kwargs):
  59. warnings.warn(msg, category=DeprecationWarning)
  60. return init(*args, **kwargs)
  61. cls.__init__ = wrapped
  62. wrapped.__name__ = '__init__'
  63. wrapped.__doc__ = self._update_doc(init.__doc__)
  64. wrapped.deprecated_original = init
  65. return cls
  66. def _decorate_fun(self, fun):
  67. """Decorate function fun"""
  68. msg = "Function %s is deprecated" % fun.__name__
  69. if self.extra:
  70. msg += "; %s" % self.extra
  71. def wrapped(*args, **kwargs):
  72. warnings.warn(msg, category=DeprecationWarning)
  73. return fun(*args, **kwargs)
  74. wrapped.__name__ = fun.__name__
  75. wrapped.__dict__ = fun.__dict__
  76. wrapped.__doc__ = self._update_doc(fun.__doc__)
  77. return wrapped
  78. def _update_doc(self, olddoc):
  79. newdoc = "DEPRECATED"
  80. if self.extra:
  81. newdoc = "%s: %s" % (newdoc, self.extra)
  82. if olddoc:
  83. newdoc = "%s\n\n%s" % (newdoc, olddoc)
  84. return newdoc
  85. def safe_mask(X, mask):
  86. """Return a mask which is safe to use on X.
  87. Parameters
  88. ----------
  89. X : {array-like, sparse matrix}
  90. Data on which to apply mask.
  91. mask: array
  92. Mask to be used on X.
  93. Returns
  94. -------
  95. mask
  96. """
  97. mask = np.asarray(mask)
  98. if np.issubdtype(mask.dtype, np.int):
  99. return mask
  100. if hasattr(X, "toarray"):
  101. ind = np.arange(mask.shape[0])
  102. mask = ind[mask]
  103. return mask
  104. def safe_indexing(X, indices):
  105. """Return items or rows from X using indices.
  106. Allows simple indexing of lists or arrays.
  107. Parameters
  108. ----------
  109. X : array-like, sparse-matrix, list.
  110. Data from which to sample rows or items.
  111. indices : array-like, list
  112. Indices according to which X will be subsampled.
  113. """
  114. if hasattr(X, "iloc"):
  115. # Pandas Dataframes and Series
  116. return X.iloc[indices]
  117. elif hasattr(X, "shape"):
  118. if hasattr(X, 'take') and (hasattr(indices, 'dtype') and
  119. indices.dtype.kind == 'i'):
  120. # This is often substantially faster than X[indices]
  121. return X.take(indices, axis=0)
  122. else:
  123. return X[indices]
  124. else:
  125. return [X[idx] for idx in indices]
  126. def resample(*arrays, **options):
  127. """Resample arrays or sparse matrices in a consistent way
  128. The default strategy implements one step of the bootstrapping
  129. procedure.
  130. Parameters
  131. ----------
  132. *arrays : sequence of arrays or scipy.sparse matrices with same shape[0]
  133. replace : boolean, True by default
  134. Implements resampling with replacement. If False, this will implement
  135. (sliced) random permutations.
  136. n_samples : int, None by default
  137. Number of samples to generate. If left to None this is
  138. automatically set to the first dimension of the arrays.
  139. random_state : int or RandomState instance
  140. Control the shuffling for reproducible behavior.
  141. Returns
  142. -------
  143. resampled_arrays : sequence of arrays or scipy.sparse matrices with same \
  144. shape[0]
  145. Sequence of resampled views of the collections. The original arrays are
  146. not impacted.
  147. Examples
  148. --------
  149. It is possible to mix sparse and dense arrays in the same run::
  150. >>> X = [[1., 0.], [2., 1.], [0., 0.]]
  151. >>> y = np.array([0, 1, 2])
  152. >>> from scipy.sparse import coo_matrix
  153. >>> X_sparse = coo_matrix(X)
  154. >>> from sklearn.utils import resample
  155. >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
  156. >>> X
  157. array([[ 1., 0.],
  158. [ 2., 1.],
  159. [ 1., 0.]])
  160. >>> X_sparse # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  161. <3x2 sparse matrix of type '<... 'numpy.float64'>'
  162. with 4 stored elements in Compressed Sparse Row format>
  163. >>> X_sparse.toarray()
  164. array([[ 1., 0.],
  165. [ 2., 1.],
  166. [ 1., 0.]])
  167. >>> y
  168. array([0, 1, 0])
  169. >>> resample(y, n_samples=2, random_state=0)
  170. array([0, 1])
  171. See also
  172. --------
  173. :class:`sklearn.cross_validation.Bootstrap`
  174. :func:`sklearn.utils.shuffle`
  175. """
  176. random_state = check_random_state(options.pop('random_state', None))
  177. replace = options.pop('replace', True)
  178. max_n_samples = options.pop('n_samples', None)
  179. if options:
  180. raise ValueError("Unexpected kw arguments: %r" % options.keys())
  181. if len(arrays) == 0:
  182. return None
  183. first = arrays[0]
  184. n_samples = first.shape[0] if hasattr(first, 'shape') else len(first)
  185. if max_n_samples is None:
  186. max_n_samples = n_samples
  187. if max_n_samples > n_samples:
  188. raise ValueError("Cannot sample %d out of arrays with dim %d" % (
  189. max_n_samples, n_samples))
  190. check_consistent_length(*arrays)
  191. arrays = [check_array(x, accept_sparse='csr', ensure_2d=False,
  192. allow_nd=True) for x in arrays]
  193. if replace:
  194. indices = random_state.randint(0, n_samples, size=(max_n_samples,))
  195. else:
  196. indices = np.arange(n_samples)
  197. random_state.shuffle(indices)
  198. indices = indices[:max_n_samples]
  199. resampled_arrays = []
  200. for array in arrays:
  201. array = array[indices]
  202. resampled_arrays.append(array)
  203. if len(resampled_arrays) == 1:
  204. # syntactic sugar for the unit argument case
  205. return resampled_arrays[0]
  206. else:
  207. return resampled_arrays
  208. def shuffle(*arrays, **options):
  209. """Shuffle arrays or sparse matrices in a consistent way
  210. This is a convenience alias to ``resample(*arrays, replace=False)`` to do
  211. random permutations of the collections.
  212. Parameters
  213. ----------
  214. *arrays : sequence of arrays or scipy.sparse matrices with same shape[0]
  215. random_state : int or RandomState instance
  216. Control the shuffling for reproducible behavior.
  217. n_samples : int, None by default
  218. Number of samples to generate. If left to None this is
  219. automatically set to the first dimension of the arrays.
  220. Returns
  221. -------
  222. shuffled_arrays : sequence of arrays or scipy.sparse matrices with same \
  223. shape[0]
  224. Sequence of shuffled views of the collections. The original arrays are
  225. not impacted.
  226. Examples
  227. --------
  228. It is possible to mix sparse and dense arrays in the same run::
  229. >>> X = [[1., 0.], [2., 1.], [0., 0.]]
  230. >>> y = np.array([0, 1, 2])
  231. >>> from scipy.sparse import coo_matrix
  232. >>> X_sparse = coo_matrix(X)
  233. >>> from sklearn.utils import shuffle
  234. >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
  235. >>> X
  236. array([[ 0., 0.],
  237. [ 2., 1.],
  238. [ 1., 0.]])
  239. >>> X_sparse # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
  240. <3x2 sparse matrix of type '<... 'numpy.float64'>'
  241. with 3 stored elements in Compressed Sparse Row format>
  242. >>> X_sparse.toarray()
  243. array([[ 0., 0.],
  244. [ 2., 1.],
  245. [ 1., 0.]])
  246. >>> y
  247. array([2, 1, 0])
  248. >>> shuffle(y, n_samples=2, random_state=0)
  249. array([0, 1])
  250. See also
  251. --------
  252. :func:`sklearn.utils.resample`
  253. """
  254. options['replace'] = False
  255. return resample(*arrays, **options)
  256. def safe_sqr(X, copy=True):
  257. """Element wise squaring of array-likes and sparse matrices.
  258. Parameters
  259. ----------
  260. X : array like, matrix, sparse matrix
  261. copy : boolean, optional, default True
  262. Whether to create a copy of X and operate on it or to perform
  263. inplace computation (default behaviour).
  264. Returns
  265. -------
  266. X ** 2 : element wise square
  267. """
  268. X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
  269. if issparse(X):
  270. if copy:
  271. X = X.copy()
  272. X.data **= 2
  273. else:
  274. if copy:
  275. X = X ** 2
  276. else:
  277. X **= 2
  278. return X
  279. def gen_batches(n, batch_size):
  280. """Generator to create slices containing batch_size elements, from 0 to n.
  281. The last slice may contain less than batch_size elements, when batch_size
  282. does not divide n.
  283. Examples
  284. --------
  285. >>> from sklearn.utils import gen_batches
  286. >>> list(gen_batches(7, 3))
  287. [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
  288. >>> list(gen_batches(6, 3))
  289. [slice(0, 3, None), slice(3, 6, None)]
  290. >>> list(gen_batches(2, 3))
  291. [slice(0, 2, None)]
  292. """
  293. start = 0
  294. for _ in range(int(n // batch_size)):
  295. end = start + batch_size
  296. yield slice(start, end)
  297. start = end
  298. if start < n:
  299. yield slice(start, n)
  300. def gen_even_slices(n, n_packs, n_samples=None):
  301. """Generator to create n_packs slices going up to n.
  302. Pass n_samples when the slices are to be used for sparse matrix indexing;
  303. slicing off-the-end raises an exception, while it works for NumPy arrays.
  304. Examples
  305. --------
  306. >>> from sklearn.utils import gen_even_slices
  307. >>> list(gen_even_slices(10, 1))
  308. [slice(0, 10, None)]
  309. >>> list(gen_even_slices(10, 10)) #doctest: +ELLIPSIS
  310. [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
  311. >>> list(gen_even_slices(10, 5)) #doctest: +ELLIPSIS
  312. [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
  313. >>> list(gen_even_slices(10, 3))
  314. [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
  315. """
  316. start = 0
  317. for pack_num in range(n_packs):
  318. this_n = n // n_packs
  319. if pack_num < n % n_packs:
  320. this_n += 1
  321. if this_n > 0:
  322. end = start + this_n
  323. if n_samples is not None:
  324. end = min(n_samples, end)
  325. yield slice(start, end, None)
  326. start = end
  327. def _get_n_jobs(n_jobs):
  328. """Get number of jobs for the computation.
  329. This function reimplements the logic of joblib to determine the actual
  330. number of jobs depending on the cpu count. If -1 all CPUs are used.
  331. If 1 is given, no parallel computing code is used at all, which is useful
  332. for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
  333. Thus for n_jobs = -2, all CPUs but one are used.
  334. Parameters
  335. ----------
  336. n_jobs : int
  337. Number of jobs stated in joblib convention.
  338. Returns
  339. -------
  340. n_jobs : int
  341. The actual number of jobs as positive integer.
  342. Examples
  343. --------
  344. >>> from sklearn.utils import _get_n_jobs
  345. >>> _get_n_jobs(4)
  346. 4
  347. >>> jobs = _get_n_jobs(-2)
  348. >>> assert jobs == max(cpu_count() - 1, 1)
  349. >>> _get_n_jobs(0)
  350. Traceback (most recent call last):
  351. ...
  352. ValueError: Parameter n_jobs == 0 has no meaning.
  353. """
  354. if n_jobs < 0:
  355. return max(cpu_count() + 1 + n_jobs, 1)
  356. elif n_jobs == 0:
  357. raise ValueError('Parameter n_jobs == 0 has no meaning.')
  358. else:
  359. return n_jobs
  360. def tosequence(x):
  361. """Cast iterable x to a Sequence, avoiding a copy if possible."""
  362. if isinstance(x, np.ndarray):
  363. return np.asarray(x)
  364. elif isinstance(x, Sequence):
  365. return x
  366. else:
  367. return list(x)
class ConvergenceWarning(UserWarning):
    """Custom warning to capture convergence problems.

    It subclasses ``UserWarning``, so it can be selected by category with the
    standard ``warnings`` filters.
    """
class DataDimensionalityWarning(UserWarning):
    """Custom warning to notify potential issues with data dimensionality.

    It subclasses ``UserWarning``, so it can be selected by category with the
    standard ``warnings`` filters.
    """