diff --git a/ludwig/modules/lr_scheduler.py b/ludwig/modules/lr_scheduler.py
index 493b93e277a..ef0f2d257e7 100644
--- a/ludwig/modules/lr_scheduler.py
+++ b/ludwig/modules/lr_scheduler.py
@@ -1,9 +1,9 @@
 import logging
 import math
-from typing import Any, Callable, Dict
+from typing import Any, Dict
 
 from torch.optim import Optimizer
-from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, LambdaLR, ReduceLROnPlateau, SequentialLR
+from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
 
 from ludwig.constants import MINIMIZE, TRAINING, VALIDATION
 from ludwig.modules.metric_registry import get_metric_objective
@@ -166,29 +166,14 @@ def get_schedule_with_warmup(
     step_info: StepInfo,
 ) -> LambdaLR:
     """Creates a learning rate scheduler that updates each training step."""
-    schedulers = []
+    decay_fn = decay_registry[config.decay]
 
-    # Warmup scheduler
-    if step_info.num_warmup_steps > 0:
-        warmup_scheduler = LambdaLR(
-            optimizer,
-            lambda current_step: float(current_step) / float(max(1, step_info.num_warmup_steps)),
-            last_epoch=-1,
-        )
-        schedulers.append(warmup_scheduler)
+    def lr_lambda(current_step: int):
+        if current_step < step_info.num_warmup_steps:
+            return float(current_step) / float(max(1, step_info.num_warmup_steps))
+        return decay_fn(current_step, step_info.num_training_steps, step_info.num_warmup_steps, config)
 
-    # Decay scheduler
-    decay = config.decay
-    decay_scheduler = decay_registry[decay](config, optimizer, step_info)
-    schedulers.append(decay_scheduler)
-
-    if len(schedulers) == 1:
-        # Only one scheduler, no need to wrap in a SequentialLR
-        return schedulers[0]
-
-    # Return a SequentialLR that applies the warmup and decay schedulers in order
-    # with the warmup scheduler only applied for the first num_warmup_steps steps.
-    return SequentialLR(optimizer, schedulers=schedulers, milestones=[step_info.num_warmup_steps], last_epoch=-1)
+    return LambdaLR(optimizer, lr_lambda, last_epoch=-1)
 
 
 def no_decay(current_step: int, num_training_steps: int, num_warmup_steps: int, config: LRSchedulerConfig):
@@ -196,11 +181,7 @@ def no_decay(current_step: int, num_training_steps: int, num_warmup_steps: int,
 
 
 def linear_decay(current_step: int, num_training_steps: int, num_warmup_steps: int, config: LRSchedulerConfig):
-    return max(
-        0.0,
-        float(num_training_steps - num_warmup_steps - current_step)
-        / float(max(1, num_training_steps - num_warmup_steps)),
-    )
+    return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
 
 
 def exponential_decay(current_step: int, num_training_steps: int, num_warmup_steps: int, config: LRSchedulerConfig):
@@ -213,36 +194,8 @@ def exponential_decay(current_step: int, num_training_steps: int, num_warmup_ste
     return math.pow(decay_rate, exponent)
 
 
-def wrap_decay_fn(decay_fn: Callable) -> Callable:
-    def init_fn(config: LRSchedulerConfig, optimizer: Optimizer, step_info: StepInfo) -> LambdaLR:
-        return LambdaLR(
-            optimizer,
-            lambda current_step: decay_fn(
-                current_step, step_info.num_training_steps, step_info.num_warmup_steps, config
-            ),
-            last_epoch=-1,
-        )
-
-    return init_fn
-
-
-def init_cosine_decay(
-    config: LRSchedulerConfig,
-    optimizer: Optimizer,
-    step_info: StepInfo,
-) -> CosineAnnealingWarmRestarts:
-    return CosineAnnealingWarmRestarts(
-        optimizer,
-        T_0=config.t_0 or step_info.steps_per_checkpoint,
-        T_mult=config.t_mult or 1,
-        eta_min=config.eta_min or 0,
-        last_epoch=-1,
-    )
-
-
 decay_registry = {
-    None: wrap_decay_fn(no_decay),
-    "linear": wrap_decay_fn(linear_decay),
-    "exponential": wrap_decay_fn(exponential_decay),
-    "cosine": init_cosine_decay,
+    None: no_decay,
+    "linear": linear_decay,
+    "exponential": exponential_decay,
 }
diff --git a/ludwig/schema/lr_scheduler.py b/ludwig/schema/lr_scheduler.py
index 3bfedab82bf..e102274c65d 100644
--- a/ludwig/schema/lr_scheduler.py
+++ b/ludwig/schema/lr_scheduler.py
@@ -17,7 +17,7 @@ class LRSchedulerConfig(schema_utils.BaseMarshmallowConfig, ABC):
     """Configuration for learning rate scheduler parameters."""
 
     decay: str = schema_utils.StringOptions(
-        options=["linear", "exponential", "cosine"],
+        options=["linear", "exponential"],
         default=None,
         allow_none=True,
         description="Turn on decay of the learning rate.",
@@ -99,32 +99,6 @@ class LRSchedulerConfig(schema_utils.BaseMarshmallowConfig, ABC):
         parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate_scheduler"]["reduce_eval_split"],
     )
 
-    # Parameters for CosineAnnealingWarmRestarts scheduler
-
-    t_0: int = schema_utils.PositiveInteger(
-        default=None,
-        allow_none=True,
-        description="Number of steps before the first restart for cosine annealing decay. If not specified, it"
-        " will be set to `steps_per_checkpoint`.",
-        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate_scheduler"]["t_0"],
-    )
-
-    t_mult: int = schema_utils.PositiveInteger(
-        default=1,
-        description="Period multiplier after each restart for cosine annealing decay. Defaults to 1, i.e.,"
-        " restart every `t_0` steps. If set to a larger value, the period between restarts increases by that"
-        " multiplier. For e.g., if t_mult is 2, then the periods would be: t_0, 2*t_0, 2^2*t_0, 2^3*t_0, etc.",
-        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate_scheduler"]["t_mult"],
-    )
-
-    eta_min: float = schema_utils.FloatRange(
-        default=0,
-        min=0,
-        max=1,
-        description="Minimum learning rate allowed for cosine annealing decay. Default: 0.",
-        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate_scheduler"]["eta_min"],
-    )
-
 
 # TODO(travis): too much boilerplate here, we should find a way to abstract all this and only require specifying the
 # minimal amount needed for the new config object.
diff --git a/ludwig/schema/metadata/configs/trainer.yaml b/ludwig/schema/metadata/configs/trainer.yaml
index 59264676a09..f97bf6b7905 100644
--- a/ludwig/schema/metadata/configs/trainer.yaml
+++ b/ludwig/schema/metadata/configs/trainer.yaml
@@ -520,10 +520,7 @@ ecd:
       suggested_values_reasoning:
         Starting with exponential decay is a safe place to start, as it is a "softer" decrease in the learning
         rate over time, as compared with linear, which is more steep after the initial drop. Linear decay is
-        most useful when the risk of catastrophic forgetting is very high (e.g, for fine-tuning pretrained
-        models). Cosine annealing is a type of learning rate schedule that has the effect of starting with a
-        large learning rate that is relatively rapidly decreased to a minimum value before being increased
-        rapidly again. The resetting of the learning rate acts like a simulated restart of the learning process.
+        most useful when the risk of catastrophic forgetting is very high (e.g, for fine-tuning pretrained models).
         If you observe your loss curves shooting up (even on the training set) in later epochs, increasing the
         decay rate may help mitigate this effect.
       ui_display_name: Decay
@@ -603,15 +600,6 @@ ecd:
     reduce_eval_split:
       expected_impact: 1
      ui_display_name: Reduce Eval Split
-    t_0:
-      expected_impact: 1
-      ui_display_name: T_0
-    t_mult:
-      expected_impact: 1
-      ui_display_name: T_mult
-    eta_min:
-      expected_impact: 1
-      ui_display_name: Eta Min
 gbm:
   learning_rate:
     commonly_used: true
diff --git a/tests/ludwig/modules/test_lr_scheduler.py b/tests/ludwig/modules/test_lr_scheduler.py
index 8ac19e606f4..e19e786e722 100644
--- a/tests/ludwig/modules/test_lr_scheduler.py
+++ b/tests/ludwig/modules/test_lr_scheduler.py
@@ -1,5 +1,3 @@
-import math
-
 import numpy as np
 from torch.optim import SGD
 
@@ -35,11 +33,6 @@ def test_lr_scheduler_warmup_decay():
     exp_scheduler = LRScheduler(config=exp_config, optimizer=exp_optimizer)
     exp_scheduler.reset(steps_per_checkpoint, total_steps)
 
-    cosine_optimizer = SGD(module.parameters(), lr=base_lr)
-    cosine_config = LRSchedulerConfig(warmup_fraction=warmup_fraction, decay="cosine", t_0=steps_per_checkpoint)
-    cosine_scheduler = LRScheduler(config=cosine_config, optimizer=cosine_optimizer)
-    cosine_scheduler.reset(steps_per_checkpoint, total_steps)
-
     warmup_steps = total_steps * warmup_fraction
     for i in range(total_steps):
         # Offset by 1
@@ -55,25 +48,17 @@ def test_lr_scheduler_warmup_decay():
         exp_scheduler.step()
         exp_lr = exp_optimizer.param_groups[0]["lr"]
 
-        cosine_scheduler.step()
-        cosine_lr = cosine_optimizer.param_groups[0]["lr"]
-
         if step < warmup_steps:
             assert linear_lr == exp_lr, f"step: {step}"
-            assert linear_lr == cosine_lr, f"step: {step}"
             assert linear_lr < base_lr, f"step: {step}"
         elif step == warmup_steps:
             assert linear_lr == base_lr, f"step: {step}"
-            assert cosine_lr == base_lr, f"step: {step}"
             assert exp_lr < base_lr, f"step: {step}"
         else:
             assert linear_lr < base_lr, f"step: {step}"
             assert exp_lr < base_lr, f"step: {step}"
-            assert cosine_lr <= base_lr, f"step: {step}"
 
     assert linear_lr < exp_lr
-    assert exp_lr < cosine_lr
-    assert cosine_lr == base_lr
 
 
 def test_lr_scheduler_reduce_on_plateau():
@@ -134,75 +119,6 @@ def test_lr_scheduler_reduce_on_plateau():
     assert np.isclose(lr, 0.001)
 
 
-def test_lr_scheduler_cosine_decay_fixed_period():
-    total_steps = 10000
-    steps_per_checkpoint = 1000
-    base_lr = 1.0
-
-    module = NumberInputFeature(NumberInputFeatureConfig(name="num1", encoder=DenseEncoderConfig()))
-
-    optimizer = SGD(module.parameters(), lr=base_lr)
-    config = LRSchedulerConfig(decay="cosine", t_0=steps_per_checkpoint, decay_rate=0, reduce_on_plateau=0)
-    scheduler = LRScheduler(config=config, optimizer=optimizer)
-    scheduler.reset(steps_per_checkpoint, total_steps)
-
-    curr_lr = base_lr
-    prev_lr = base_lr
-    num_restarts = 0
-    for step in range(total_steps + 1):
-        # Cosine annealing formula
-        expected_lr = base_lr * 0.5 * (1 + math.cos(math.pi * (step % steps_per_checkpoint) / steps_per_checkpoint))
-        assert np.isclose(curr_lr, expected_lr), f"step: {step}"
-
-        if prev_lr < curr_lr:
-            # Since Cosine decay is periodic, we should see the learning rate
-            # decrease and then increase again.
-            num_restarts += 1
-
-        prev_lr = curr_lr
-        scheduler.step()
-
-        curr_lr = optimizer.param_groups[0]["lr"]
-
-    assert num_restarts == 10, f"num_restarts: {num_restarts}"
-
-
-def test_lr_scheduler_cosine_decay_increasing_period():
-    total_steps = 20000
-    steps_per_checkpoint = 1000
-    base_lr = 1.0
-
-    module = NumberInputFeature(NumberInputFeatureConfig(name="num1", encoder=DenseEncoderConfig()))
-
-    optimizer = SGD(module.parameters(), lr=base_lr)
-    config = LRSchedulerConfig(
-        decay="cosine",
-        t_0=steps_per_checkpoint,
-        t_mult=2,
-        decay_rate=0,
-        reduce_on_plateau=0,
-    )
-    scheduler = LRScheduler(config=config, optimizer=optimizer)
-    scheduler.reset(steps_per_checkpoint, total_steps)
-
-    curr_lr = base_lr
-    prev_lr = base_lr
-    num_restarts = 0
-    for _ in range(total_steps + 1):
-        if prev_lr < curr_lr:
-            # Since Cosine decay is periodic, we should see the learning rate
-            # decrease and then increase again.
-            num_restarts += 1
-
-        prev_lr = curr_lr
-        scheduler.step()
-
-        curr_lr = optimizer.param_groups[0]["lr"]
-
-    # 1000, 3000, 6000, 12000, 24000 (but we stop at 20000)
-    assert num_restarts == 4, f"num_restarts: {num_restarts}"
-
-
 def test_lr_scheduler_save_load():
     steps_per_checkpoint = 10
     total_steps = 100