diff --git a/ludwig/modules/lr_scheduler.py b/ludwig/modules/lr_scheduler.py
index 493b93e277a..ef0f2d257e7 100644
--- a/ludwig/modules/lr_scheduler.py
+++ b/ludwig/modules/lr_scheduler.py
@@ -1,9 +1,9 @@
 import logging
 import math
-from typing import Any, Callable, Dict
+from typing import Any, Dict
 
 from torch.optim import Optimizer
-from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, LambdaLR, ReduceLROnPlateau, SequentialLR
+from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
 
 from ludwig.constants import MINIMIZE, TRAINING, VALIDATION
 from ludwig.modules.metric_registry import get_metric_objective
@@ -166,29 +166,14 @@ def get_schedule_with_warmup(
     step_info: StepInfo,
 ) -> LambdaLR:
     """Creates a learning rate scheduler that updates each training step."""
-    schedulers = []
+    decay_fn = decay_registry[config.decay]
 
-    # Warmup scheduler
-    if step_info.num_warmup_steps > 0:
-        warmup_scheduler = LambdaLR(
-            optimizer,
-            lambda current_step: float(current_step) / float(max(1, step_info.num_warmup_steps)),
-            last_epoch=-1,
-        )
-        schedulers.append(warmup_scheduler)
+    def lr_lambda(current_step: int):
+        if current_step < step_info.num_warmup_steps:
+            return float(current_step) / float(max(1, step_info.num_warmup_steps))
+        return decay_fn(current_step, step_info.num_training_steps, step_info.num_warmup_steps, config)
 
-    # Decay scheduler
-    decay = config.decay
-    decay_scheduler = decay_registry[decay](config, optimizer, step_info)
-    schedulers.append(decay_scheduler)
-
-    if len(schedulers) == 1:
-        # Only one scheduler, no need to wrap in a SequentialLR
-        return schedulers[0]
-
-    # Return a SequentialLR that applies the warmup and decay schedulers in order
-    # with the warmup scheduler only applied for the first num_warmup_steps steps.
-    return SequentialLR(optimizer, schedulers=schedulers, milestones=[step_info.num_warmup_steps], last_epoch=-1)
+    return LambdaLR(optimizer, lr_lambda, last_epoch=-1)
 
 
 def no_decay(current_step: int, num_training_steps: int, num_warmup_steps: int, config: LRSchedulerConfig):
@@ -196,11 +181,7 @@ def no_decay(current_step: int, num_training_steps: int, num_warmup_steps: int,
 
 
 def linear_decay(current_step: int, num_training_steps: int, num_warmup_steps: int, config: LRSchedulerConfig):
-    return max(
-        0.0,
-        float(num_training_steps - num_warmup_steps - current_step)
-        / float(max(1, num_training_steps - num_warmup_steps)),
-    )
+    return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
 
 
 def exponential_decay(current_step: int, num_training_steps: int, num_warmup_steps: int, config: LRSchedulerConfig):
@@ -213,36 +194,8 @@ def exponential_decay(current_step: int, num_training_steps: int, num_warmup_ste
     return math.pow(decay_rate, exponent)
 
 
-def wrap_decay_fn(decay_fn: Callable) -> Callable:
-    def init_fn(config: LRSchedulerConfig, optimizer: Optimizer, step_info: StepInfo) -> LambdaLR:
-        return LambdaLR(
-            optimizer,
-            lambda current_step: decay_fn(
-                current_step, step_info.num_training_steps, step_info.num_warmup_steps, config
-            ),
-            last_epoch=-1,
-        )
-
-    return init_fn
-
-
-def init_cosine_decay(
-    config: LRSchedulerConfig,
-    optimizer: Optimizer,
-    step_info: StepInfo,
-) -> CosineAnnealingWarmRestarts:
-    return CosineAnnealingWarmRestarts(
-        optimizer,
-        T_0=config.t_0 or step_info.steps_per_checkpoint,
-        T_mult=config.t_mult or 1,
-        eta_min=config.eta_min or 0,
-        last_epoch=-1,
-    )
-
-
 decay_registry = {
-    None: wrap_decay_fn(no_decay),
-    "linear": wrap_decay_fn(linear_decay),
-    "exponential": wrap_decay_fn(exponential_decay),
-    "cosine": init_cosine_decay,
+    None: no_decay,
+    "linear": linear_decay,
+    "exponential": exponential_decay,
 }
diff --git a/ludwig/schema/lr_scheduler.py b/ludwig/schema/lr_scheduler.py
index 3bfedab82bf..e102274c65d 100644
--- a/ludwig/schema/lr_scheduler.py
+++ b/ludwig/schema/lr_scheduler.py
@@ -17,7 +17,7 @@ class LRSchedulerConfig(schema_utils.BaseMarshmallowConfig, ABC):
     """Configuration for learning rate scheduler parameters."""
 
     decay: str = schema_utils.StringOptions(
-        options=["linear", "exponential", "cosine"],
+        options=["linear", "exponential"],
         default=None,
         allow_none=True,
         description="Turn on decay of the learning rate.",
@@ -99,32 +99,6 @@ class LRSchedulerConfig(schema_utils.BaseMarshmallowConfig, ABC):
         parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate_scheduler"]["reduce_eval_split"],
     )
 
-    # Parameters for CosineAnnealingWarmRestarts scheduler
-
-    t_0: int = schema_utils.PositiveInteger(
-        default=None,
-        allow_none=True,
-        description="Number of steps before the first restart for cosine annealing decay. If not specified, it"
-        " will be set to `steps_per_checkpoint`.",
-        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate_scheduler"]["t_0"],
-    )
-
-    t_mult: int = schema_utils.PositiveInteger(
-        default=1,
-        description="Period multiplier after each restart for cosine annealing decay. Defaults to 1, i.e.,"
-        " restart every `t_0` steps. If set to a larger value, the period between restarts increases by that"
-        " multiplier. For e.g., if t_mult is 2, then the periods would be: t_0, 2*t_0, 2^2*t_0, 2^3*t_0, etc.",
-        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate_scheduler"]["t_mult"],
-    )
-
-    eta_min: float = schema_utils.FloatRange(
-        default=0,
-        min=0,
-        max=1,
-        description="Minimum learning rate allowed for cosine annealing decay. Default: 0.",
-        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate_scheduler"]["eta_min"],
-    )
-
 
 # TODO(travis): too much boilerplate here, we should find a way to abstract all this and only require specifying the
 # minimal amount needed for the new config object.
diff --git a/ludwig/schema/metadata/configs/trainer.yaml b/ludwig/schema/metadata/configs/trainer.yaml
index 59264676a09..f97bf6b7905 100644
--- a/ludwig/schema/metadata/configs/trainer.yaml
+++ b/ludwig/schema/metadata/configs/trainer.yaml
@@ -520,10 +520,7 @@ ecd:
       suggested_values_reasoning:
         Starting with exponential decay is a safe place to start, as it is a "softer" decrease in the learning
         rate over time, as compared with linear, which is more steep after the initial drop. Linear decay is
-        most useful when the risk of catastrophic forgetting is very high (e.g, for fine-tuning pretrained
-        models). Cosine annealing is a type of learning rate schedule that has the effect of starting with a
-        large learning rate that is relatively rapidly decreased to a minimum value before being increased
-        rapidly again. The resetting of the learning rate acts like a simulated restart of the learning process.
+        most useful when the risk of catastrophic forgetting is very high (e.g, for fine-tuning pretrained models).
         If you observe your loss curves shooting up (even on the training set) in later epochs, increasing the
         decay rate may help mitigate this effect.
       ui_display_name: Decay
@@ -603,15 +600,6 @@ ecd:
     reduce_eval_split:
       expected_impact: 1
      ui_display_name: Reduce Eval Split
-    t_0:
-      expected_impact: 1
-      ui_display_name: T_0
-    t_mult:
-      expected_impact: 1
-      ui_display_name: T_mult
-    eta_min:
-      expected_impact: 1
-      ui_display_name: Eta Min
 gbm:
   learning_rate:
     commonly_used: true
diff --git a/tests/ludwig/modules/test_lr_scheduler.py b/tests/ludwig/modules/test_lr_scheduler.py
index 8ac19e606f4..e19e786e722 100644
--- a/tests/ludwig/modules/test_lr_scheduler.py
+++ b/tests/ludwig/modules/test_lr_scheduler.py
@@ -1,5 +1,3 @@
-import math
-
 import numpy as np
 from torch.optim import SGD
 
@@ -35,11 +33,6 @@ def test_lr_scheduler_warmup_decay():
     exp_scheduler = LRScheduler(config=exp_config, optimizer=exp_optimizer)
     exp_scheduler.reset(steps_per_checkpoint, total_steps)
 
-    cosine_optimizer = SGD(module.parameters(), lr=base_lr)
-    cosine_config = LRSchedulerConfig(warmup_fraction=warmup_fraction, decay="cosine", t_0=steps_per_checkpoint)
-    cosine_scheduler = LRScheduler(config=cosine_config, optimizer=cosine_optimizer)
-    cosine_scheduler.reset(steps_per_checkpoint, total_steps)
-
     warmup_steps = total_steps * warmup_fraction
     for i in range(total_steps):
         # Offset by 1
@@ -55,25 +48,17 @@ def test_lr_scheduler_warmup_decay():
         exp_scheduler.step()
         exp_lr = exp_optimizer.param_groups[0]["lr"]
 
-        cosine_scheduler.step()
-        cosine_lr = cosine_optimizer.param_groups[0]["lr"]
-
         if step < warmup_steps:
             assert linear_lr == exp_lr, f"step: {step}"
-            assert linear_lr == cosine_lr, f"step: {step}"
             assert linear_lr < base_lr, f"step: {step}"
         elif step == warmup_steps:
             assert linear_lr == base_lr, f"step: {step}"
-            assert cosine_lr == base_lr, f"step: {step}"
             assert exp_lr < base_lr, f"step: {step}"
         else:
             assert linear_lr < base_lr, f"step: {step}"
             assert exp_lr < base_lr, f"step: {step}"
-            assert cosine_lr <= base_lr, f"step: {step}"
 
     assert linear_lr < exp_lr
-    assert exp_lr < cosine_lr
-    assert cosine_lr == base_lr
 
 
 def test_lr_scheduler_reduce_on_plateau():
@@ -134,75 +119,6 @@ def test_lr_scheduler_reduce_on_plateau():
     assert np.isclose(lr, 0.001)
 
 
-def test_lr_scheduler_cosine_decay_fixed_period():
-    total_steps = 10000
-    steps_per_checkpoint = 1000
-    base_lr = 1.0
-
-    module = NumberInputFeature(NumberInputFeatureConfig(name="num1", encoder=DenseEncoderConfig()))
-
-    optimizer = SGD(module.parameters(), lr=base_lr)
-    config = LRSchedulerConfig(decay="cosine", t_0=steps_per_checkpoint, decay_rate=0, reduce_on_plateau=0)
-    scheduler = LRScheduler(config=config, optimizer=optimizer)
-    scheduler.reset(steps_per_checkpoint, total_steps)
-
-    curr_lr = base_lr
-    prev_lr = base_lr
-    num_restarts = 0
-    for step in range(total_steps + 1):
-        # Cosine annealing formula
-        expected_lr = base_lr * 0.5 * (1 + math.cos(math.pi * (step % steps_per_checkpoint) / steps_per_checkpoint))
-        assert np.isclose(curr_lr, expected_lr), f"step: {step}"
-
-        if prev_lr < curr_lr:
-            # Since Cosine decay is periodic, we should see the learning rate
-            # decrease and then increase again.
-            num_restarts += 1
-
-        prev_lr = curr_lr
-        scheduler.step()
-
-        curr_lr = optimizer.param_groups[0]["lr"]
-
-    assert num_restarts == 10, f"num_restarts: {num_restarts}"
-
-
-def test_lr_scheduler_cosine_decay_increasing_period():
-    total_steps = 20000
-    steps_per_checkpoint = 1000
-    base_lr = 1.0
-
-    module = NumberInputFeature(NumberInputFeatureConfig(name="num1", encoder=DenseEncoderConfig()))
-
-    optimizer = SGD(module.parameters(), lr=base_lr)
-    config = LRSchedulerConfig(
-        decay="cosine",
-        t_0=steps_per_checkpoint,
-        t_mult=2,
-        decay_rate=0,
-        reduce_on_plateau=0,
-    )
-    scheduler = LRScheduler(config=config, optimizer=optimizer)
-    scheduler.reset(steps_per_checkpoint, total_steps)
-
-    curr_lr = base_lr
-    prev_lr = base_lr
-    num_restarts = 0
-    for _ in range(total_steps + 1):
-        if prev_lr < curr_lr:
-            # Since Cosine decay is periodic, we should see the learning rate
-            # decrease and then increase again.
-            num_restarts += 1
-
-        prev_lr = curr_lr
-        scheduler.step()
-
-        curr_lr = optimizer.param_groups[0]["lr"]
-
-    # 1000, 3000, 6000, 12000, 24000 (but we stop at 20000)
-    assert num_restarts == 4, f"num_restarts: {num_restarts}"
-
-
 def test_lr_scheduler_save_load():
     steps_per_checkpoint = 10
     total_steps = 100