This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[FEATURE] Add "Reduce LR On Plateau" scheduler #897

Open · wants to merge 2 commits into base: v0.x
4 changes: 3 additions & 1 deletion src/gluonnlp/__init__.py
@@ -30,6 +30,7 @@
 from . import optimizer
 from . import initializer
 from .vocab import Vocab
+from . import lr_scheduler

 __version__ = '0.8.0.dev'

@@ -42,4 +43,5 @@
 'initializer',
 'optimizer',
 'utils',
-'metric']
+'metric',
+'lr_scheduler']
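
With the subpackage imported at the top level, the new scheduler becomes reachable as nlp.lr_scheduler.ReduceLROnPlateau. A minimal usage sketch (the Dense layer and hyperparameters here are illustrative placeholders, not part of the patch):

from mxnet import gluon
import gluonnlp as nlp

net = gluon.nn.Dense(10)
net.initialize()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
# Reduce the LR by 10x once more than two epochs pass without improvement.
scheduler = nlp.lr_scheduler.ReduceLROnPlateau(trainer, mode='min', patience=2)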
25 changes: 25 additions & 0 deletions src/gluonnlp/lr_scheduler/__init__.py
@@ -0,0 +1,25 @@
# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=wildcard-import
"""NLP LR scheduler."""

from .reduce_lr_on_plateau import *

__all__ = reduce_lr_on_plateau.__all__
210 changes: 210 additions & 0 deletions src/gluonnlp/lr_scheduler/reduce_lr_on_plateau.py
@@ -0,0 +1,210 @@
# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Reduce LR on Plateau"""

__all__ = ['ReduceLROnPlateau']

from functools import partial

import numpy as np
from mxnet import gluon


class ReduceLROnPlateau:
r"""Reduce learning rate when a metric has stopped improving.

Models often benefit from reducing the learning rate by a factor
of 2-10 once learning stagnates. This scheduler reads a metric
value and, if no improvement is seen for a 'patience' number
of epochs, reduces the learning rate.

Parameters
----------
trainer : mxnet.gluon.Trainer
Wrapped trainer.
mode : str, default 'min'
One of `min`, `max`. In `min` mode, lr will
be reduced when the quantity monitored has stopped
decreasing; in `max` mode it will be reduced when the
quantity monitored has stopped increasing.
factor : float, default 0.1
Factor by which the learning rate will be
reduced. new_lr = lr * factor.
patience : int, default 10
Number of epochs with no improvement after
which learning rate will be reduced. For example, if
`patience = 2`, then we will ignore the first 2 epochs
with no improvement, and will only decrease the LR after the
3rd epoch if the loss still hasn't improved then.
verbose : bool, default False
If True, prints a message to stdout for
each update.
threshold : float, default 1e-4
Threshold for measuring the new optimum,
to only focus on significant changes.
threshold_mode : str, default 'rel'
One of `rel`, `abs`. In `rel` mode,
dynamic_threshold = best * ( 1 + threshold ) in 'max'
mode or best * ( 1 - threshold ) in `min` mode.
In `abs` mode, dynamic_threshold = best + threshold in
`max` mode or best - threshold in `min` mode.
cooldown : int, default 0
Number of epochs to wait before resuming
normal operation after lr has been reduced.
min_lr : float, default 0
A lower bound on the learning rate.
eps : float, default 1e-8
Minimal decay applied to lr. If the difference
between new and old lr is smaller than eps, the update is
ignored.

Examples
--------

>>> model = gluon.nn.Dense(10)
>>> model.initialize()
>>> trainer = gluon.Trainer(model.collect_params(), 'SGD')
>>> scheduler = ReduceLROnPlateau(trainer, 'min')
>>> for epoch in range(10):  # doctest: +SKIP
...     train(...)  # doctest: +SKIP
...     val_loss = validate(...)  # doctest: +SKIP
...     # Note that step should be called after validate()
...     scheduler.step(val_loss)  # doctest: +SKIP
"""

def __init__(self,
trainer,
mode='min',
factor=0.1,
patience=10,
verbose=False,
threshold=1e-4,
threshold_mode='rel',
cooldown=0,
min_lr=0,
eps=1e-8):

if factor >= 1.0:
raise ValueError('Factor should be < 1.0.')
self.factor = factor

if not isinstance(trainer, gluon.Trainer):
raise TypeError('{} is not an mxnet.gluon.Trainer'.format(
type(trainer).__name__))
self.trainer = trainer

self.min_lr = min_lr

self.patience = patience
self.verbose = verbose
self.cooldown = cooldown
self.cooldown_counter = 0
self.mode = mode
self.threshold = threshold
self.threshold_mode = threshold_mode
self.best = None
self.num_bad_epochs = None
self.mode_worse = None  # the worst value for the chosen mode
self.is_better = None
self.eps = eps
self.last_epoch = -1
self._init_is_better(mode=mode,
threshold=threshold,
threshold_mode=threshold_mode)
self._reset()

def _reset(self):
r"""Resets num_bad_epochs counter and cooldown counter."""
self.best = self.mode_worse
self.cooldown_counter = 0
self.num_bad_epochs = 0

def step(self, metric, epoch=None):
r"""Function to be executed after model evaluation

Parameters
----------
metric : float
Current metric value used to measure model performance.
epoch : int, default None
Current epoch. If None, it is managed internally.

"""
current = float(metric)
if epoch is None:
epoch = self.last_epoch = self.last_epoch + 1
self.last_epoch = epoch

if self.is_better(current, self.best):
self.best = current
self.num_bad_epochs = 0
else:
self.num_bad_epochs += 1

if self.in_cooldown:
self.cooldown_counter -= 1
self.num_bad_epochs = 0

if self.num_bad_epochs > self.patience:
self._reduce_lr(epoch)
self.cooldown_counter = self.cooldown
self.num_bad_epochs = 0

def _reduce_lr(self, epoch):
old_lr = float(self.trainer.learning_rate)
new_lr = max(old_lr * self.factor, self.min_lr)
if old_lr - new_lr > self.eps:
self.trainer.set_learning_rate(new_lr)
if self.verbose:
print('Epoch {:5d}: reducing learning rate'
' from {} to {}.'.format(epoch, old_lr, new_lr))

@property
def in_cooldown(self):
return self.cooldown_counter > 0

def _cmp(self, mode, threshold_mode, threshold, a, best):
Member:

The current design does not look very scalable: supporting new comparison modes or schedules would require hard-coding them here.

Member (Author):

Thanks for the comments. I will consider a more flexible class design.
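
For illustration, a more table-driven variant along the lines the reviewer hints at could register comparators in a lookup table, so new modes can be added without editing the control flow (a hypothetical sketch, not part of this patch):

# Hypothetical refactoring sketch: comparators keyed by
# (mode, threshold_mode); new entries can be registered without
# touching the scheduler itself.
_COMPARATORS = {
    ('min', 'rel'): lambda a, best, t: a < best * (1. - t),
    ('min', 'abs'): lambda a, best, t: a < best - t,
    ('max', 'rel'): lambda a, best, t: a > best * (1. + t),
    ('max', 'abs'): lambda a, best, t: a > best + t,
}

def is_better(self, current, best):
    cmp_fn = _COMPARATORS[(self.mode, self.threshold_mode)]
    return cmp_fn(current, best, self.threshold)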

if mode == 'min' and threshold_mode == 'rel':
rel_epsilon = 1. - threshold
return a < best * rel_epsilon

elif mode == 'min' and threshold_mode == 'abs':
return a < best - threshold

elif mode == 'max' and threshold_mode == 'rel':
rel_epsilon = threshold + 1.
return a > best * rel_epsilon

else:  # mode == 'max' and threshold_mode == 'abs'
return a > best + threshold

def _init_is_better(self, mode, threshold, threshold_mode):
if mode not in {'min', 'max'}:
raise ValueError('mode ' + mode + ' is unknown!')
if threshold_mode not in {'rel', 'abs'}:
raise ValueError('threshold mode ' + threshold_mode +
' is unknown!')

if mode == 'min':
self.mode_worse = np.Inf
else: # mode == 'max':
self.mode_worse = -np.Inf

self.is_better = partial(self._cmp, mode, threshold_mode, threshold)
Member:

Why do you want to use partial? mode, threshold_mode and threshold are simply instance attributes, so you can use them directly via the self. prefix.

Member (Author):

Right. I will fix.
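
A minimal sketch of the fix the author agrees to here: drop functools.partial and read the attributes directly (hypothetical, mirroring the existing _cmp logic):

def is_better(self, current, best):
    # Reads self.mode / self.threshold_mode / self.threshold directly
    # instead of binding them with functools.partial in __init__.
    if self.mode == 'min' and self.threshold_mode == 'rel':
        return current < best * (1. - self.threshold)
    if self.mode == 'min' and self.threshold_mode == 'abs':
        return current < best - self.threshold
    if self.mode == 'max' and self.threshold_mode == 'rel':
        return current > best * (1. + self.threshold)
    return current > best + self.threshold  # 'max' and 'abs'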

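To make the patience and threshold semantics above concrete, a short walk-through with illustrative numbers (not part of the patch):

from mxnet import gluon
import gluonnlp as nlp

net = gluon.nn.Dense(2)
net.initialize()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
scheduler = nlp.lr_scheduler.ReduceLROnPlateau(trainer, mode='min',
                                               patience=2, factor=0.5)
for loss in [1.0, 0.9, 0.9, 0.9, 0.9]:
    scheduler.step(loss)
# 0.9 becomes the best value; the later 0.9s count as bad epochs because
# they are not below 0.9 * (1 - 1e-4). On the third bad epoch,
# num_bad_epochs (3) exceeds patience (2) and the LR drops to 0.05.
assert abs(trainer.learning_rate - 0.05) < 1e-12
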
40 changes: 40 additions & 0 deletions tests/unittest/test_lr_scheduler.py
@@ -0,0 +1,40 @@
# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from mxnet import gluon

import gluonnlp as nlp


def test_reduce_lr_on_plateau():
model = gluon.nn.Dense(2)
model.initialize()
trainer = gluon.Trainer(model.collect_params(), 'SGD')
scheduler = nlp.lr_scheduler.ReduceLROnPlateau(trainer,
'min',
patience=0,
factor=0.1)
base_loss = 0.1
scheduler.step(base_loss)
base_lr = scheduler.trainer.learning_rate
next_loss = 0.11
scheduler.step(next_loss)
next_lr = scheduler.trainer.learning_rate
expected_lr = base_lr * 0.1
assert expected_lr == next_lr
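
A possible companion test for the cooldown path, reusing the file's imports (a sketch under the same public API, not part of the patch):

def test_reduce_lr_on_plateau_cooldown():
    model = gluon.nn.Dense(2)
    model.initialize()
    trainer = gluon.Trainer(model.collect_params(), 'SGD')
    scheduler = nlp.lr_scheduler.ReduceLROnPlateau(trainer,
                                                   'min',
                                                   patience=0,
                                                   factor=0.1,
                                                   cooldown=1)
    scheduler.step(0.1)   # establishes the best value
    scheduler.step(0.2)   # bad epoch -> LR reduced, cooldown starts
    lr_after_first_cut = trainer.learning_rate
    scheduler.step(0.2)   # in cooldown: the bad epoch is ignored
    assert trainer.learning_rate == lr_after_first_cut
    scheduler.step(0.2)   # cooldown expired -> LR reduced again
    assert trainer.learning_rate == lr_after_first_cut * 0.1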