/sklearn/linear_model/sgd_fast.pyx

# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
#
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Mathieu Blondel (partial_fit support)
#         Rob Zinkov (passive-aggressive)
#         Lars Buitinck
#
# Licence: BSD 3 clause

import numpy as np
import sys
from time import time

cimport cython
from libc.math cimport exp, log, sqrt, pow, fabs
cimport numpy as np
cdef extern from "numpy/npy_math.h":
    bint isfinite "npy_isfinite"(double) nogil

from sklearn.utils.weight_vector cimport WeightVector
from sklearn.utils.seq_dataset cimport SequentialDataset

np.import_array()


# Penalty constants
DEF NO_PENALTY = 0
DEF L1 = 1
DEF L2 = 2
DEF ELASTICNET = 3

# Learning rate constants
DEF CONSTANT = 1
DEF OPTIMAL = 2
DEF INVSCALING = 3
DEF PA1 = 4
DEF PA2 = 5

# ----------------------------------------
# Extension Types for Loss Functions
# ----------------------------------------


cdef class LossFunction:
    """Base class for convex loss functions"""

    cdef double loss(self, double p, double y) nogil:
        """Evaluate the loss function.

        Parameters
        ----------
        p : double
            The prediction, p = w^T x
        y : double
            The true value (aka target)

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        return 0.

    def dloss(self, double p, double y):
        """Evaluate the derivative of the loss function with respect to
        the prediction `p`.

        Parameters
        ----------
        p : double
            The prediction, p = w^T x
        y : double
            The true value (aka target)

        Returns
        -------
        double
            The derivative of the loss function with regards to `p`.
        """
        return self._dloss(p, y)

    cdef double _dloss(self, double p, double y) nogil:
        # Implementation of dloss; separate function because cpdef and nogil
        # can't be combined.
        return 0.
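
# Illustrative sketch (not part of the original module): the Python-visible
# ``dloss`` of any LossFunction should agree with a centered finite difference
# of ``loss`` wherever the loss is differentiable in p. ``loss`` is a cdef
# method, so this helper has to live inside this .pyx module to call it.
def _check_dloss_reference(LossFunction loss_fn, double p, double y,
                           double h=1e-6):
    cdef double numeric = (loss_fn.loss(p + h, y)
                           - loss_fn.loss(p - h, y)) / (2.0 * h)
    # returns (finite-difference estimate, analytic derivative)
    return numeric, loss_fn.dloss(p, y)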

cdef class Regression(LossFunction):
    """Base class for loss functions for regression"""

    cdef double loss(self, double p, double y) nogil:
        return 0.

    cdef double _dloss(self, double p, double y) nogil:
        return 0.


cdef class Classification(LossFunction):
    """Base class for loss functions for classification"""

    cdef double loss(self, double p, double y) nogil:
        return 0.

    cdef double _dloss(self, double p, double y) nogil:
        return 0.

cdef class ModifiedHuber(Classification):
    """Modified Huber loss for binary classification with y in {-1, 1}

    This is equivalent to quadratically smoothed SVM with gamma = 2.

    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
    Stochastic Gradient Descent', ICML'04.
    """
    cdef double loss(self, double p, double y) nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return (1.0 - z) * (1.0 - z)
        else:
            return -4.0 * z

    cdef double _dloss(self, double p, double y) nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return 2.0 * (1.0 - z) * -y
        else:
            return -4.0 * y

    def __reduce__(self):
        return ModifiedHuber, ()

cdef class Hinge(Classification):
    """Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------
    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
        When threshold=0.0, one gets the loss used by the Perceptron.
    """
    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double p, double y) nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return (self.threshold - z)
        return 0.0

    cdef double _dloss(self, double p, double y) nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return -y
        return 0.0

    def __reduce__(self):
        return Hinge, (self.threshold,)

cdef class SquaredHinge(LossFunction):
    """Squared Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------
    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by
        (quadratically penalized) SVM.
    """
    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double p, double y) nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return z * z
        return 0.0

    cdef double _dloss(self, double p, double y) nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return -2 * y * z
        return 0.0

    def __reduce__(self):
        return SquaredHinge, (self.threshold,)

cdef class Log(Classification):
    """Logistic regression loss for binary classification with y in {-1, 1}"""

    cdef double loss(self, double p, double y) nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18:
            return exp(-z)
        if z < -18:
            return -z
        return log(1.0 + exp(-z))

    cdef double _dloss(self, double p, double y) nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18.0:
            return exp(-z) * -y
        if z < -18.0:
            return -y
        return -y / (exp(z) + 1.0)

    def __reduce__(self):
        return Log, ()
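
# Illustrative sketch (not part of the original module): the Log gradient can
# be read through the logistic sigmoid. With z = p * y, the value
# -y / (exp(z) + 1.0) returned by Log._dloss equals -y * (1 - sigmoid(z)),
# i.e. -y scaled by the predicted probability of the wrong label. Unlike
# _dloss, this direct form does not clip at |z| = 18 and relies on C
# overflow/underflow of exp for extreme margins.
def _log_dloss_via_sigmoid(double p, double y):
    cdef double z = p * y
    cdef double sigmoid_z = 1.0 / (1.0 + exp(-z))
    return -y * (1.0 - sigmoid_z)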

cdef class SquaredLoss(Regression):
    """Squared loss traditionally used in linear regression."""

    cdef double loss(self, double p, double y) nogil:
        return 0.5 * (p - y) * (p - y)

    cdef double _dloss(self, double p, double y) nogil:
        return p - y

    def __reduce__(self):
        return SquaredLoss, ()

cdef class Huber(Regression):
    """Huber regression loss

    Variant of the SquaredLoss that is robust to outliers (quadratic near
    zero, linear for large errors).

    http://en.wikipedia.org/wiki/Huber_Loss_Function
    """
    cdef double c

    def __init__(self, double c):
        self.c = c

    cdef double loss(self, double p, double y) nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return 0.5 * r * r
        else:
            return self.c * abs_r - (0.5 * self.c * self.c)

    cdef double _dloss(self, double p, double y) nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return r
        elif r > 0.0:
            return self.c
        else:
            return -self.c

    def __reduce__(self):
        return Huber, (self.c,)

cdef class EpsilonInsensitive(Regression):
    """Epsilon-Insensitive loss (used by SVR).

    loss = max(0, |y - p| - epsilon)
    """
    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double p, double y) nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret if ret > 0 else 0

    cdef double _dloss(self, double p, double y) nogil:
        if y - p > self.epsilon:
            return -1
        elif p - y > self.epsilon:
            return 1
        else:
            return 0

    def __reduce__(self):
        return EpsilonInsensitive, (self.epsilon,)

cdef class SquaredEpsilonInsensitive(Regression):
    """Squared Epsilon-Insensitive loss.

    loss = max(0, |y - p| - epsilon)^2
    """
    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double p, double y) nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret * ret if ret > 0 else 0

    cdef double _dloss(self, double p, double y) nogil:
        cdef double z
        z = y - p
        if z > self.epsilon:
            return -2 * (z - self.epsilon)
        elif z < -self.epsilon:
            return 2 * (-z - self.epsilon)
        else:
            return 0

    def __reduce__(self):
        return SquaredEpsilonInsensitive, (self.epsilon,)
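
# Descriptive note (added for clarity, not part of the original module): with
# z = y - p, the class above implements
#     loss  = max(0, |z| - epsilon)**2
#     dloss = -2 * (z - epsilon)   if z >  epsilon
#           =  2 * (-z - epsilon)  if z < -epsilon
#           =  0                   otherwise
# so the gradient vanishes inside the epsilon-tube around the target.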

def plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights,
              double intercept,
              LossFunction loss,
              int penalty_type,
              double alpha, double C,
              double l1_ratio,
              SequentialDataset dataset,
              int n_iter, int fit_intercept,
              int verbose, bint shuffle, np.uint32_t seed,
              double weight_pos, double weight_neg,
              int learning_rate, double eta0,
              double power_t,
              double t=1.0,
              double intercept_decay=1.0):
    """Plain SGD for generic loss functions and penalties.

    Parameters
    ----------
    weights : ndarray[double, ndim=1]
        The allocated coef_ vector.
    intercept : double
        The initial intercept.
    loss : LossFunction
        A concrete ``LossFunction`` object.
    penalty_type : int
        The penalty: 2 for L2, 1 for L1, and 3 for Elastic-Net.
    alpha : float
        The regularization parameter.
    C : float
        Maximum step size for passive aggressive.
    l1_ratio : float
        The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
        l1_ratio=0 corresponds to the L2 penalty, l1_ratio=1 to the L1 penalty.
    dataset : SequentialDataset
        A concrete ``SequentialDataset`` object.
    n_iter : int
        The number of iterations (epochs).
    fit_intercept : int
        Whether or not to fit the intercept (1 or 0).
    verbose : int
        Print verbose output; 0 for quiet.
    shuffle : boolean
        Whether to shuffle the training data before each epoch.
    weight_pos : float
        The weight of the positive class.
    weight_neg : float
        The weight of the negative class.
    seed : np.uint32_t
        Seed of the pseudorandom number generator used to shuffle the data.
    learning_rate : int
        The learning rate schedule:
        (1) constant, eta = eta0
        (2) optimal, eta = 1.0 / (alpha * t)
        (3) inverse scaling, eta = eta0 / pow(t, power_t)
        (4) Passive Aggressive-I, eta = min(C, loss / sqnorm(x))
        (5) Passive Aggressive-II, eta = loss / (sqnorm(x) + 0.5 / C)
    eta0 : double
        The initial learning rate.
    power_t : double
        The exponent for the inverse scaling learning rate.
    t : double
        Initial state of the learning rate. This value is equal to the
        iteration count except when the learning rate is set to `optimal`.
        Default: 1.0.
    intercept_decay : double
        Multiplier applied to the intercept updates. Default: 1.0.

    Returns
    -------
    weights : array, shape=[n_features]
        The fitted weight vector.
    intercept : float
        The fitted intercept term.
    """
    # get the data information into easy vars
    cdef Py_ssize_t n_samples = dataset.n_samples
    cdef Py_ssize_t n_features = weights.shape[0]

    cdef WeightVector w = WeightVector(weights)
    cdef double *x_data_ptr = NULL
    cdef int *x_ind_ptr = NULL

    # helper variables
    cdef bint infinity = False
    cdef int xnnz
    cdef double eta = 0.0
    cdef double p = 0.0
    cdef double update = 0.0
    cdef double sumloss = 0.0
    cdef double y = 0.0
    cdef double sample_weight
    cdef double class_weight = 1.0
    cdef unsigned int count = 0
    cdef unsigned int epoch = 0
    cdef unsigned int i = 0
    cdef int is_hinge = isinstance(loss, Hinge)

    # q vector is only used for L1 regularization
    cdef np.ndarray[double, ndim=1, mode="c"] q = None
    cdef double * q_data_ptr = NULL
    if penalty_type == L1 or penalty_type == ELASTICNET:
        q = np.zeros((n_features,), dtype=np.float64, order="c")
        q_data_ptr = <double * > q.data
    cdef double u = 0.0

    if penalty_type == L2:
        l1_ratio = 0.0
    elif penalty_type == L1:
        l1_ratio = 1.0

    eta = eta0

    t_start = time()
    with nogil:
        for epoch in range(n_iter):
            if verbose > 0:
                with gil:
                    print("-- Epoch %d" % (epoch + 1))
            if shuffle:
                dataset.shuffle(seed)
            for i in range(n_samples):
                dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
                             &y, &sample_weight)

                p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
                if learning_rate == OPTIMAL:
                    eta = 1.0 / (alpha * t)
                elif learning_rate == INVSCALING:
                    eta = eta0 / pow(t, power_t)

                if verbose > 0:
                    sumloss += loss.loss(p, y)

                if y > 0.0:
                    class_weight = weight_pos
                else:
                    class_weight = weight_neg

                if learning_rate == PA1:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    if update == 0:
                        continue
                    update = min(C, loss.loss(p, y) / update)
                elif learning_rate == PA2:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    update = loss.loss(p, y) / (update + 0.5 / C)
                else:
                    update = -eta * loss._dloss(p, y)

                if learning_rate >= PA1:
                    if is_hinge:
                        # classification
                        update *= y
                    elif y - p < 0:
                        # regression
                        update *= -1

                update *= class_weight * sample_weight

                if penalty_type >= L2:
                    w.scale(1.0 - ((1.0 - l1_ratio) * eta * alpha))
                if update != 0.0:
                    w.add(x_data_ptr, x_ind_ptr, xnnz, update)
                    if fit_intercept == 1:
                        intercept += update * intercept_decay
                if penalty_type == L1 or penalty_type == ELASTICNET:
                    u += (l1_ratio * eta * alpha)
                    l1penalty(w, q_data_ptr, x_ind_ptr, xnnz, u)
                t += 1
                count += 1

            # report epoch information
            if verbose > 0:
                with gil:
                    print("Norm: %.2f, NNZs: %d, "
                          "Bias: %.6f, T: %d, Avg. loss: %.6f"
                          % (w.norm(), weights.nonzero()[0].shape[0],
                             intercept, count, sumloss / count))
                    print("Total training time: %.2f seconds."
                          % (time() - t_start))

            # floating-point under-/overflow check.
            if (not isfinite(intercept)
                    or any_nonfinite(<double *>weights.data, n_features)):
                infinity = True
                break

    if infinity:
        raise ValueError(("Floating-point under-/overflow occurred at epoch"
                          " #%d. Scaling input data with StandardScaler or"
                          " MinMaxScaler might help.") % (epoch + 1))

    w.reset_wscale()

    return weights, intercept
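
# Dense reference sketch of one plain SGD step with an L2 penalty (added for
# illustration only; the estimators go through plain_sgd above, which also
# handles sparsity, class/sample weights, L1/Elastic-Net and the
# passive-aggressive updates, none of which appear here). ``w`` and ``x`` are
# 1-d NumPy arrays of dtype float64.
def _sgd_step_l2_reference(w, double intercept, x, double y,
                           LossFunction loss, double eta, double alpha):
    p = float(np.dot(w, x)) + intercept      # prediction p = w^T x + b
    update = -eta * loss.dloss(p, y)         # step along the negative gradient
    w *= 1.0 - eta * alpha                   # L2 penalty: shrink the weights
    if update != 0.0:
        w += update * x                      # gradient step on the weights
        intercept += update                  # and on the intercept
    return w, intercept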

cdef bint any_nonfinite(double *w, int n) nogil:
    for i in range(n):
        if not isfinite(w[i]):
            return True
    return False


cdef double sqnorm(double * x_data_ptr, int * x_ind_ptr, int xnnz) nogil:
    cdef double x_norm = 0.0
    cdef int j
    cdef double z
    for j in range(xnnz):
        z = x_data_ptr[j]
        x_norm += z * z
    return x_norm

cdef void l1penalty(WeightVector w, double * q_data_ptr,
                    int *x_ind_ptr, int xnnz, double u) nogil:
    """Apply the L1 penalty to each updated feature

    This implements the truncated gradient approach by
    [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].
    """
    cdef double z = 0.0
    cdef int j = 0
    cdef int idx = 0
    cdef double wscale = w.wscale
    cdef double *w_data_ptr = w.w_data_ptr
    for j in range(xnnz):
        idx = x_ind_ptr[j]
        z = w_data_ptr[idx]
        if wscale * w_data_ptr[idx] > 0.0:
            w_data_ptr[idx] = max(
                0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))
        elif wscale * w_data_ptr[idx] < 0.0:
            w_data_ptr[idx] = min(
                0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))
        q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)
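
# Dense reference sketch of the cumulative truncated-gradient L1 step that
# l1penalty applies per feature (added for illustration only; the real code
# works on the scaled, sparse WeightVector). ``w`` and ``q`` are 1-d NumPy
# arrays: ``u`` is the total L1 penalty that could have been applied so far,
# and ``q`` records how much each weight has actually been penalized, so
# weights are clipped towards zero but never pushed across it.
def _l1penalty_dense_reference(w, q, double u):
    cdef int idx
    for idx in range(w.shape[0]):
        z = w[idx]
        if w[idx] > 0.0:
            w[idx] = max(0.0, w[idx] - (u + q[idx]))
        elif w[idx] < 0.0:
            w[idx] = min(0.0, w[idx] + (u - q[idx]))
        q[idx] += w[idx] - z
    return w, q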