diff --git a/examples/plot_svrg.py b/examples/plot_svrg.py
index 0bc2df08..3f892841 100644
--- a/examples/plot_svrg.py
+++ b/examples/plot_svrg.py
@@ -71,7 +71,8 @@ def __call__(self, clf):
     print("eta =", eta)
     cb = Callback(X, y)
     clf = SVRGClassifier(loss="squared_hinge", alpha=1e-5, eta=eta,
-                         n_inner=1.0, max_iter=20, random_state=0, callback=cb)
+                         do_averaging=True, n_inner=1.0, max_iter=20,
+                         random_state=0, callback=cb)
     clf.fit(X, y)
     plt.plot(cb.times, cb.obj, label="eta=" + str(eta))
 
@@ -85,8 +86,8 @@ def __call__(self, clf):
     print("n_inner =", n_inner)
     cb = Callback(X, y)
     clf = SVRGClassifier(loss="squared_hinge", alpha=1e-5, eta=1e-4,
-                         n_inner=n_inner, max_iter=20, random_state=0,
-                         callback=cb)
+                         do_averaging=True, n_inner=n_inner, max_iter=20,
+                         random_state=0, callback=cb)
     clf.fit(X, y)
     plt.plot(cb.times, cb.obj, label="n_inner=" + str(n_inner))
 
diff --git a/lightning/impl/svrg.py b/lightning/impl/svrg.py
index c6055d26..f3a56461 100644
--- a/lightning/impl/svrg.py
+++ b/lightning/impl/svrg.py
@@ -20,10 +20,9 @@ class _BaseSVRG(object):
 
     def _finalize_coef(self):
-        self.coef_ *= self.coef_scale_
-        self.coef_scale_.fill(1.0)
+        pass
 
-    def _fit(self, X, Y):
+    def _fit(self, X, Y, do_averaging):
         n_samples, n_features = X.shape
         rng = self._get_random_state()
         loss = self._get_loss()
         n_vectors = Y.shape[1]
@@ -32,16 +31,16 @@ def _fit(self, X, Y):
         ds = get_dataset(X, order="c")
 
         self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
+        self.average_coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
         full_grad = np.zeros_like(self.coef_)
         grad = np.zeros((n_vectors, n_samples), dtype=np.float64)
-        self.coef_scale_ = np.ones(n_vectors, dtype=np.float64)
 
         for i in xrange(n_vectors):
             y = Y[:, i]
 
-            _svrg_fit(self, ds, y, self.coef_[i], self.coef_scale_[i:],
+            _svrg_fit(self, ds, y, self.coef_[i], self.average_coef_[i],
                       full_grad[i], grad[i], self.eta, self.alpha, loss,
-                      self.max_iter, n_inner, self.tol, self.verbose,
+                      self.max_iter, n_inner, self.tol, self.verbose, self.do_averaging,
                       self.callback, rng)
 
         return self
@@ -59,7 +58,7 @@ class SVRGClassifier(BaseClassifier, _BaseSVRG):
 
     def __init__(self, eta=1.0, alpha=1.0, loss="smooth_hinge", gamma=1.0,
                  max_iter=10, n_inner=1.0, tol=1e-3, verbose=0,
-                 callback=None, random_state=None):
+                 callback=None, random_state=None, do_averaging=False):
         self.eta = eta
         self.alpha = alpha
         self.loss = loss
@@ -70,6 +69,7 @@ def __init__(self, eta=1.0, alpha=1.0, loss="smooth_hinge", gamma=1.0,
         self.verbose = verbose
         self.callback = callback
         self.random_state = random_state
+        self.do_averaging = do_averaging
 
     def _get_loss(self):
         losses = {
@@ -85,7 +85,7 @@ def fit(self, X, y):
         self._set_label_transformers(y)
         Y = np.asfortranarray(self.label_binarizer_.transform(y),
                               dtype=np.float64)
-        return self._fit(X, Y)
+        return self._fit(X, Y, self.do_averaging)
 
 
 class SVRGRegressor(BaseRegressor, _BaseSVRG):
@@ -100,7 +100,7 @@ class SVRGRegressor(BaseRegressor, _BaseSVRG):
 
     def __init__(self, eta=1.0, alpha=1.0, loss="squared", gamma=1.0,
                  max_iter=10, n_inner=1.0, tol=1e-3, verbose=0,
-                 callback=None, random_state=None):
+                 callback=None, random_state=None, do_averaging=False):
         self.eta = eta
         self.alpha = alpha
         self.loss = loss
@@ -111,6 +111,7 @@ def __init__(self, eta=1.0, alpha=1.0, loss="squared", gamma=1.0,
         self.verbose = verbose
         self.callback = callback
         self.random_state = random_state
+        self.do_averaging = do_averaging
 
     def _get_loss(self):
         losses = {
@@ -122,4 +123,4 @@ def fit(self, X, y):
         self.outputs_2d_ = len(y.shape) > 1
         Y = y.reshape(-1, 1) if not self.outputs_2d_ else y
         Y = Y.astype(np.float64)
-        return self._fit(X, Y)
+        return self._fit(X, Y, self.do_averaging)
diff --git a/lightning/impl/svrg_fast.pyx b/lightning/impl/svrg_fast.pyx
index aaed6a8e..6ae85d10 100644
--- a/lightning/impl/svrg_fast.pyx
+++ b/lightning/impl/svrg_fast.pyx
@@ -4,6 +4,7 @@
 # cython: wraparound=False
 #
 # Author: Mathieu Blondel
+#         Krishna Pillutla (averaging support)
 # License: BSD
 
 import numpy as np
@@ -12,6 +13,7 @@ cimport numpy as np
 ctypedef np.int64_t LONG
 
 from libc.math cimport sqrt
+from libc.math cimport pow as powc
 
 from lightning.impl.randomkit.random_fast cimport RandomState
 from lightning.impl.dataset_fast cimport RowDataset
@@ -49,7 +51,7 @@ def _svrg_fit(self,
               RowDataset X,
               np.ndarray[double, ndim=1]y,
               np.ndarray[double, ndim=1]coef,
-              np.ndarray[double, ndim=1]coef_scale,
+              np.ndarray[double, ndim=1]avg_coef,
               np.ndarray[double, ndim=1]full_grad,
               np.ndarray[double, ndim=1]grad,
               double eta,
@@ -59,6 +61,7 @@ def _svrg_fit(self,
               int n_inner,
               double tol,
               int verbose,
+              int do_averaging,
              callback,
              RandomState rng):
 
@@ -71,7 +74,13 @@ def _svrg_fit(self,
     cdef double violation, violation_init, violation_ratio
     cdef double eta_avg = eta / n_samples
     cdef double eta_alpha = eta * alpha
+    cdef double one_minus_eta_alpha = 1 - eta_alpha
+    cdef double one_over_eta_alpha = 1 / eta_alpha if eta_alpha > 0 else 0.0
     cdef int has_callback = callback is not None
+    cdef double w_scale = 1.0
+    cdef double avg_a = 0.0, avg_b = 1.0
+    cdef double correction, correction_avg
+    cdef double mu
 
     # Data pointers.
     cdef double* data
@@ -79,15 +88,14 @@ def _svrg_fit(self,
     cdef int n_nz
 
     # Buffers and pointers.
-    cdef np.ndarray[int, ndim=1]last = np.zeros(n_features, dtype=np.int32)
     cdef double* w = <double*>coef.data
-    cdef double* w_scale = <double*>coef_scale.data
+    cdef double* w_avg = <double*>avg_coef.data
     cdef double* fg = <double*>full_grad.data
     cdef double* g = <double*>grad.data
 
     for it in xrange(max_iter):
 
-        # Reset full gradient
+        # Reset full gradient.
         for j in xrange(n_features):
             fg[j] = 0
 
@@ -98,7 +106,7 @@ def _svrg_fit(self,
             X.get_row_ptr(i, &indices, &data, &n_nz)
 
             # Make prediction.
-            y_pred = _pred(data, indices, n_nz, w) * w_scale[0]
+            y_pred = _pred(data, indices, n_nz, w) * w_scale
 
             # A gradient is given by g[i] * X[i].
             g[i] = -loss.get_update(y_pred, y[i])
@@ -107,7 +115,7 @@ def _svrg_fit(self,
 
         # Compute optimality violation.
         violation = 0
-        alpha_scaled = alpha * w_scale[0]
+        alpha_scaled = alpha * w_scale
         for j in xrange(n_features):
             tmp = fg[j] / n_samples + alpha_scaled * w[j]
             violation += tmp * tmp
@@ -134,46 +142,65 @@ def _svrg_fit(self,
             # Retrieve sample i.
             X.get_row_ptr(i, &indices, &data, &n_nz)
 
-            # Add deterministic part, just in time.
-            if t > 0:
-                for jj in xrange(n_nz):
-                    j = indices[jj]
-                    w[j] -= eta_avg / w_scale[0] * (t - last[j]) * fg[j]
-                    last[j] = t
-
-            # Make prediction.
-            y_pred = _pred(data, indices, n_nz, w) * w_scale[0]
+            # Make prediction, accounting for correction due to
+            # dense (deterministic) part of update.
+            y_pred = _pred(data, indices, n_nz, w) * w_scale
+            if eta_alpha > 0:
+                correction = (1 - powc(one_minus_eta_alpha, t)) / eta_alpha
+            else:
+                correction = t
+            y_pred -= _pred(data, indices, n_nz, fg) * eta_avg * correction
 
             # A gradient is given by scale * X[i].
             scale = -loss.get_update(y_pred, y[i])
 
-            w_scale[0] *= (1 - eta_alpha)
-
-            # Add deterministic part.
-            #for j in xrange(n_features):
-                #w[j] -= eta_avg / w_scale * fg[j]
+            w_scale *= (1 - eta_alpha)
 
             # Add stochastic part.
-            _add(data, indices, n_nz, eta * (g[i] - scale) / w_scale[0], w)
+            _add(data, indices, n_nz, eta * (g[i] - scale) / w_scale, w)
 
-            # Take care of possible underflows.
-            if w_scale[0] < 1e-9:
+            # Update average (or reset, at t = 0) of stochastic part.
+            if t == 0:
                 for j in xrange(n_features):
-                    w[j] *= w_scale[0]
-                w_scale[0] = 1.0
+                    w_avg[j] = 0.0
+                avg_a = w_scale
+                avg_b = 1.0
+            else:
+                mu = 1.0 / (t + 1)
+                _add(data, indices, n_nz, eta * (scale - g[i]) * avg_a / w_scale, w_avg)
+                avg_b /= (1.0 - mu)
+                avg_a += mu * avg_b * w_scale
 
-        # Finalize.
+            # Take care of possible underflows.
+            if w_scale < 1e-9:
+                for j in xrange(n_features):
+                    w[j] *= w_scale
+                avg_a /= w_scale
+                w_scale = 1.0
+
+        # Finalize. Reconstruct w and w_avg. Add deterministic update to w and w_avg.
+        if eta_alpha > 0:
+            correction = (1.0 - powc(one_minus_eta_alpha, n_inner)) / eta_alpha
+            correction_avg = one_over_eta_alpha - one_minus_eta_alpha * correction / (n_inner * eta_alpha)
+        else:
+            correction = n_inner
+            correction_avg = (n_inner - 1.0) / 2.0
         for j in xrange(n_features):
-            w[j] -= eta_avg / w_scale[0] * (n_inner - last[j]) * fg[j]
-            last[j] = 0
+            w_avg[j] = (w_avg[j] + avg_a * w[j]) / avg_b
+            w_avg[j] -= eta_avg * fg[j] * correction_avg
+            w[j] *= w_scale
+            w[j] -= eta_avg * fg[j] * correction
+        w_scale = 1.0
+        avg_a = 0.0
+        avg_b = 1.0
+
+        # Update iterate, if averaging
+        if do_averaging:
+            for j in xrange(n_features):
+                w[j] = w_avg[j]
 
         # Callback.
         if has_callback:
            ret = callback(self)
            if ret is not None:
                break
-
-    # Rescale coefficients.
-    for j in xrange(n_features):
-        w[j] *= w_scale[0]
-    w_scale[0] = 1.0
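
Reviewer's note on what the kernel computes. Stripped of the w_scale, lazy-correction, and avg_a/avg_b bookkeeping, one outer iteration of _svrg_fit performs ordinary SVRG inner steps with multiplicative L2 shrinkage, and optionally replaces the epoch's final iterate by the uniform average of the inner iterates. The dense NumPy sketch below is only a reference under stated assumptions (squared loss, dense X, uniform sampling); svrg_epoch and its signature are illustrative and not part of the lightning API.

    import numpy as np


    def svrg_epoch(X, y, w, eta, alpha, n_inner, do_averaging=False, rng=None):
        """One SVRG epoch: returns the next reference point (last iterate or average)."""
        rng = np.random.RandomState(0) if rng is None else rng
        n_samples = X.shape[0]

        # Snapshot and full loss gradient at the snapshot (regularizer excluded,
        # like `fg` in the kernel; the L2 term is the (1 - eta*alpha) shrinkage).
        w_tilde = w.copy()
        full_grad = X.T.dot(X.dot(w_tilde) - y) / n_samples

        w_avg = np.zeros_like(w)
        for t in range(n_inner):
            i = rng.randint(n_samples)
            # Variance-reduced gradient of the squared loss on sample i.
            g_i = (X[i].dot(w) - y[i]) * X[i]
            g_i_tilde = (X[i].dot(w_tilde) - y[i]) * X[i]
            v = g_i - g_i_tilde + full_grad
            w = (1.0 - eta * alpha) * w - eta * v
            # Running (uniform) average of the inner iterates, reset at t == 0.
            w_avg = w_avg + (w - w_avg) / (t + 1.0)

        return w_avg if do_averaging else w

With do_averaging=True the return value plays the role of average_coef_: it becomes the snapshot for the next epoch, which is what the final `w[j] = w_avg[j]` copy at the end of the Cython epoch achieves.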
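
The closed-form `correction` introduced in the last hunk replaces the old per-feature `last[j]` bookkeeping: each inner step subtracts eta/n_samples * fg once and then shrinks it by (1 - eta*alpha) on every later step, so after t steps the accumulated coefficient on fg is the geometric partial sum of (1 - eta*alpha)**k for k < t, i.e. (1 - (1 - eta*alpha)**t) / (eta*alpha), with the eta*alpha = 0 limit handled separately as t. A throwaway sanity check of that identity (values are arbitrary and illustrative, not from the patch):

    eta, alpha, t = 1e-2, 1e-3, 57                   # arbitrary illustrative values
    q = 1.0 - eta * alpha
    closed_form = (1.0 - q ** t) / (eta * alpha)     # what the kernel's powc-based correction computes
    step_by_step = sum(q ** k for k in range(t))     # explicit per-step accumulation
    assert abs(closed_form - step_by_step) < 1e-8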