diff --git a/benchmarks/fp8/torchao/Dockerfile b/benchmarks/fp8/torchao/Dockerfile
new file mode 100644
index 00000000000..88c21934d4e
--- /dev/null
+++ b/benchmarks/fp8/torchao/Dockerfile
@@ -0,0 +1,12 @@
+FROM nvcr.io/nvidia/pytorch:24.07-py3
+
+RUN pip install transformers evaluate datasets
+RUN git clone https://github.com/huggingface/accelerate.git
+
+RUN cd accelerate && \
+ pip install -e . && \
+ cd benchmarks/fp8
+
+CMD ["/bin/bash"]
+
+
diff --git a/benchmarks/fp8/torchao/README.md b/benchmarks/fp8/torchao/README.md
new file mode 100644
index 00000000000..d5abadaf64e
--- /dev/null
+++ b/benchmarks/fp8/torchao/README.md
@@ -0,0 +1,32 @@
+# FP8 Benchmarks
+
+Comparing and running [torchao](https://github.com/pytorch/ao/tree/main/torchao/float8) FP8 with accelerate
+
+## Overview
+
+This repo provides scripts which compare native `torchao` model training against `accelerate`'s own integration. Each training setup is covered by its own script, supporting the following:
+
+* Single GPU training (`non_distributed.py`)
+* Multi-GPU training via DistributedDataParallelism (`ddp.py`)
+* Fully Sharded Data Parallelism (`fsdp.py`)
+* DeepSpeed ZeRO 1-3 (`distrib_deepspeed.py`)
+
+To run them, it is recommended to use a Docker image (see the attached `Dockerfile`) rather than installing `torchao` manually.
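+
+For example, a local image can be built from the repository root (the image tag below is arbitrary):
+
+```bash
+docker build -t accelerate-fp8-torchao -f benchmarks/fp8/torchao/Dockerfile .
+```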
+
+## Running
+
+An official Docker image is available at `huggingface/accelerate:gpu-fp8-torchao-nightly` and can be used directly.
+
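+For example, a minimal way to pull and start it (assuming the NVIDIA Container Toolkit is configured for GPU access):
+
+```bash
+docker pull huggingface/accelerate:gpu-fp8-torchao-nightly
+docker run --gpus all -it huggingface/accelerate:gpu-fp8-torchao-nightly
+```
+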
+All scripts can be run with the core `accelerate launch` command; no `accelerate config` is needed.
+
+For single GPU, run it via `python`:
+
+```bash
+python non_distributed.py
+```
+
+For the rest, run it via `accelerate launch`:
+
+```bash
+accelerate launch ddp.py # or fsdp.py, distrib_deepspeed.py
+```
\ No newline at end of file
diff --git a/benchmarks/fp8/torchao/ddp.py b/benchmarks/fp8/torchao/ddp.py
new file mode 100644
index 00000000000..5cb125b56b2
--- /dev/null
+++ b/benchmarks/fp8/torchao/ddp.py
@@ -0,0 +1,158 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script tests to ensure that `accelerate` performs at the same level as raw `torchao`.
+
+This particular script verifies this for DDP training.
+"""
+
+from functools import partial
+
+import evaluate
+import torch
+from fp8_utils import get_training_utilities
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torchao.float8 import convert_to_float8_training
+
+from accelerate import Accelerator
+from accelerate.state import AcceleratorState
+from accelerate.utils import AORecipeKwargs, set_seed
+
+
+MODEL_NAME = "bert-base-cased"
+METRIC = evaluate.load("glue", "mrpc")
+
+
+def evaluate_model(model, dataloader, metric, accelerator=None):
+ "Turns model to .eval(), runs dataloader, calculates metric, then turns eval back on"
+ model.eval()
+ for step, batch in enumerate(dataloader):
+ with torch.no_grad():
+ outputs = model(**batch)
+ predictions = outputs.logits.argmax(dim=-1)
+ references = batch["labels"]
+ if accelerator is not None and accelerator.num_processes > 1:
+ predictions, references = accelerator.gather_for_metrics((predictions, references))
+ metric.add_batch(predictions=predictions, references=references)
+ return metric.compute()
+
+
+def filter_linear_layers(module, fqn, first_layer_name=None, last_layer_name=None):
+ if isinstance(module, torch.nn.Linear):
+ if module.in_features % 16 != 0 or module.out_features % 16 != 0:
+ return False
+ # For stability reasons, we skip the first and last linear layers
+ # Otherwise, this can lead to the model not training or converging properly
+ if fqn in (first_layer_name, last_layer_name):
+ return False
+ return True
+
+
+def train_baseline():
+ set_seed(42)
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
+ first_linear = None
+ last_linear = None
+ for name, module in model.named_modules():
+ if isinstance(module, torch.nn.Linear):
+ if first_linear is None:
+ first_linear = name
+ last_linear = name
+ func = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear)
+ accelerator = Accelerator()
+ device = accelerator.device
+ model.to(device)
+
+ convert_to_float8_training(model, module_filter_fn=func)
+
+ # Convert the model to DDP
+ device_ids, output_device = [accelerator.local_process_index], accelerator.local_process_index
+ model = DDP(model, device_ids=device_ids, output_device=output_device)
+
+ base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+ model.train()
+
+ for batch in train_dataloader:
+ with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+ batch = batch.to(device)
+ outputs = model(**batch)
+ loss = outputs.loss
+ loss.backward()
+ optimizer.step()
+ optimizer.zero_grad()
+ lr_scheduler.step()
+
+ trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+
+ assert (
+ trained_model_results["accuracy"] > base_model_results["accuracy"]
+ ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
+ assert (
+ trained_model_results["f1"] > base_model_results["f1"]
+ ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+
+ return base_model_results, trained_model_results
+
+
+def train_integration():
+ AcceleratorState()._reset_state(True)
+ accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[AORecipeKwargs()])
+ set_seed(42)
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
+ MODEL_NAME, accelerator=accelerator
+ )
+
+ model, optimizer = accelerator.prepare(model, optimizer)
+ base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+ model.train()
+
+ for batch in train_dataloader:
+ outputs = model(**batch)
+ loss = outputs.loss
+ accelerator.backward(loss)
+ optimizer.step()
+ optimizer.zero_grad()
+ lr_scheduler.step()
+
+ trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+
+ assert (
+ trained_model_results["accuracy"] > base_model_results["accuracy"]
+ ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
+ assert (
+ trained_model_results["f1"] > base_model_results["f1"]
+ ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+
+ return base_model_results, trained_model_results
+
+
+if __name__ == "__main__":
+ baseline_not_trained, baseline_trained = train_baseline()
+ accelerator_not_trained, accelerator_trained = train_integration()
+
+ assert (
+ baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
+ ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
+ assert (
+ baseline_not_trained["f1"] == accelerator_not_trained["f1"]
+ ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
+ assert (
+ baseline_trained["accuracy"] == accelerator_trained["accuracy"]
+ ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
+ assert (
+ baseline_trained["f1"] == accelerator_trained["f1"]
+ ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+
+ torch.distributed.destroy_process_group()
diff --git a/benchmarks/fp8/torchao/distrib_deepspeed.py b/benchmarks/fp8/torchao/distrib_deepspeed.py
new file mode 100644
index 00000000000..6fc2080b304
--- /dev/null
+++ b/benchmarks/fp8/torchao/distrib_deepspeed.py
@@ -0,0 +1,213 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script tests to ensure that `accelerate` performs at the same level as raw `torchao`.
+
+This particular script verifies this for DeepSpeed training.
+"""
+
+from functools import partial
+from unittest.mock import patch
+
+import deepspeed
+import evaluate
+import torch
+from fp8_utils import evaluate_model, get_training_utilities
+from torchao.float8 import convert_to_float8_training
+from transformers.integrations import HfDeepSpeedConfig
+
+from accelerate import Accelerator, DeepSpeedPlugin
+from accelerate.state import AcceleratorState
+from accelerate.utils import AORecipeKwargs, set_seed
+
+
+MODEL_NAME = "bert-base-cased"
+METRIC = evaluate.load("glue", "mrpc")
+
+
+def filter_linear_layers(module, fqn, first_layer_name=None, last_layer_name=None):
+ if isinstance(module, torch.nn.Linear):
+ if module.in_features % 16 != 0 or module.out_features % 16 != 0:
+ return False
+ # For stability reasons, we skip the first and last linear layers
+ # Otherwise, this can lead to the model not training or converging properly
+ if fqn in (first_layer_name, last_layer_name):
+ return False
+ return True
+
+
+def train_baseline(zero_stage: int = 1):
+ set_seed(42)
+ # This forces transformers to think Zero-3 Init should be used
+ with patch("transformers.integrations.deepspeed.is_deepspeed_zero3_enabled") as mock:
+ mock.return_value = zero_stage == 3
+
+ config = HfDeepSpeedConfig(
+ {
+ "train_micro_batch_size_per_gpu": 16,
+ "gradient_accumulation_steps": 1,
+ "zero_optimization": {"stage": zero_stage},
+ }
+ )
+ plugin = DeepSpeedPlugin(hf_ds_config=config)
+ accelerator = Accelerator(deepspeed_plugin=plugin)
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
+ MODEL_NAME, accelerator=accelerator
+ )
+ first_linear = None
+ last_linear = None
+ for name, module in model.named_modules():
+ if isinstance(module, torch.nn.Linear):
+ if first_linear is None:
+ first_linear = name
+ last_linear = name
+ func = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear)
+
+ convert_to_float8_training(model, module_filter_fn=func)
+
+ import numpy as np
+
+ config = {
+ "train_batch_size": 32,
+ "train_micro_batch_size_per_gpu": 16,
+ "gradient_accumulation_steps": 1,
+ "zero_optimization": {
+ "stage": zero_stage,
+ "offload_optimizer": {"device": "none", "nvme_path": None},
+ "offload_param": {"device": "none", "nvme_path": None},
+ "stage3_gather_16bit_weights_on_model_save": False,
+ },
+ "gradient_clipping": 1.0,
+ "steps_per_print": np.inf,
+ "bf16": {"enabled": True},
+ "fp16": {"enabled": False},
+ "zero_allow_untested_optimizer": True,
+ }
+
+ (
+ model,
+ optimizer,
+ _,
+ lr_scheduler,
+ ) = deepspeed.initialize(
+ model=model,
+ optimizer=optimizer,
+ lr_scheduler=lr_scheduler,
+ config_params=config,
+ )
+
+ base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+ model.train()
+
+ model_outputs = []
+ data = []
+
+ for batch in train_dataloader:
+ outputs = model(**batch)
+ data.append(batch.to("cpu"))
+ model_outputs.append(outputs.logits.to("cpu"))
+ loss = outputs.loss
+ model.backward(loss)
+ model.step()
+ for _ in range(accelerator.num_processes):
+ lr_scheduler.step()
+
+ trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+ model.destroy()
+ assert (
+ trained_model_results["accuracy"] > base_model_results["accuracy"]
+ ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
+ assert (
+ trained_model_results["f1"] > base_model_results["f1"]
+ ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+
+ del config
+ return base_model_results, trained_model_results, model_outputs, data
+
+
+def train_integration(zero_stage: int = 1):
+ set_seed(42)
+ AcceleratorState()._reset_state(True)
+ config = HfDeepSpeedConfig(
+ {
+ "train_micro_batch_size_per_gpu": 16,
+ "gradient_accumulation_steps": 1,
+ "zero_optimization": {"stage": zero_stage},
+ }
+ )
+ deepspeed_plugin = DeepSpeedPlugin(
+ hf_ds_config=config,
+ )
+ # This forces transformers to think Zero-3 Init should be used
+ with patch("transformers.integrations.deepspeed.is_deepspeed_zero3_enabled") as mock:
+ mock.return_value = zero_stage == 3
+ accelerator = Accelerator(
+ mixed_precision="fp8", kwargs_handlers=[AORecipeKwargs()], deepspeed_plugin=deepspeed_plugin
+ )
+
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
+ MODEL_NAME, accelerator=accelerator
+ )
+
+ model, optimizer, lr_scheduler, train_dataloader, eval_dataloader = accelerator.prepare(
+ model, optimizer, lr_scheduler, train_dataloader, eval_dataloader
+ )
+ base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+ model.train()
+ model_outputs = []
+ data = []
+ for batch in train_dataloader:
+ outputs = model(**batch)
+ data.append(batch.to("cpu"))
+ model_outputs.append(outputs.logits.to("cpu"))
+ loss = outputs.loss
+ accelerator.backward(loss)
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+
+ trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+ model.destroy()
+ assert (
+ trained_model_results["accuracy"] > base_model_results["accuracy"]
+ ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
+ assert (
+ trained_model_results["f1"] > base_model_results["f1"]
+ ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+
+ del config
+ return base_model_results, trained_model_results, model_outputs, data
+
+
+if __name__ == "__main__":
+ for zero_stage in [1, 2, 3]:
+ baseline_not_trained, baseline_trained, baseline_outputs, baseline_data = train_baseline(zero_stage)
+ accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(
+ zero_stage
+ )
+ assert (
+ baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
+ ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
+ assert (
+ baseline_not_trained["f1"] == accelerator_not_trained["f1"]
+ ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
+ assert (
+ baseline_trained["accuracy"] == accelerator_trained["accuracy"]
+ ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
+ assert (
+ baseline_trained["f1"] == accelerator_trained["f1"]
+ ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+ AcceleratorState()._reset_state(True)
+ torch.distributed.destroy_process_group()
diff --git a/benchmarks/fp8/torchao/fp8_utils.py b/benchmarks/fp8/torchao/fp8_utils.py
new file mode 100644
index 00000000000..1aaa7db5df9
--- /dev/null
+++ b/benchmarks/fp8/torchao/fp8_utils.py
@@ -0,0 +1,116 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+
+def get_dataloaders(model_name: str, batch_size: int = 16):
+ from datasets import load_dataset
+ from torch.utils.data import DataLoader
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ datasets = load_dataset("glue", "mrpc")
+
+ def tokenize_function(examples):
+ # max_length=None => use the model max length (it's actually the default)
+ outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+ return outputs
+
+ # Apply the method we just defined to all the examples in all the splits of the dataset
+ # starting with the main process first:
+ tokenized_datasets = datasets.map(
+ tokenize_function,
+ batched=True,
+ remove_columns=["idx", "sentence1", "sentence2"],
+ )
+
+ # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
+ # transformers library
+ tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+
+ def collate_fn(examples):
+ return tokenizer.pad(
+ examples,
+ padding="longest",
+ pad_to_multiple_of=16, # Specific for FP8
+ return_tensors="pt",
+ )
+
+ # Instantiate dataloaders.
+ train_dataloader = DataLoader(
+ tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
+ )
+ eval_dataloader = DataLoader(
+ tokenized_datasets["validation"],
+ shuffle=False,
+ collate_fn=collate_fn,
+ batch_size=16,
+ drop_last=True,
+ )
+
+ return train_dataloader, eval_dataloader
+
+
+def get_training_utilities(model_name: str, batch_size: int = 16, accelerator=None, prepare=True):
+ """
+ Returns a tuple of:
+ - Model
+ - Optimizer
+ - Train dataloader (prepared)
+ - Eval dataloader (prepared)
+ - LR Scheduler
+ Suitable for training on the MRPC dataset
+ """
+ from torch.optim import AdamW
+ from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup
+
+ from accelerate import Accelerator
+
+ if accelerator is None:
+ accelerator = Accelerator()
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
+ train_dataloader, eval_dataloader = get_dataloaders(model_name, batch_size)
+ optimizer = AdamW(model.parameters(), lr=0.0001)
+ lr_scheduler = get_linear_schedule_with_warmup(
+ optimizer=optimizer,
+ num_warmup_steps=100,
+ num_training_steps=len(train_dataloader) * 2,
+ )
+ train_dataloader, eval_dataloader = accelerator.prepare(train_dataloader, eval_dataloader)
+ return model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
+
+
+def get_named_parameters(model):
+ """
+ Same as `Accelerator.get_named_parameters`: returns a dict of the model's named parameters (after extracting
+ the model from any parallel wrapper)
+ """
+ from accelerate.utils import extract_model_from_parallel
+
+ model = extract_model_from_parallel(model)
+ return {n: p for n, p in model.named_parameters()}
+
+
+def evaluate_model(model, dataloader, metric, accelerator=None):
+ "Turns model to .eval(), runs dataloader, calculates metric, then turns eval back on"
+ model.eval()
+ for step, batch in enumerate(dataloader):
+ with torch.no_grad():
+ outputs = model(**batch)
+ predictions = outputs.logits.argmax(dim=-1)
+ references = batch["labels"]
+ if accelerator is not None and accelerator.num_processes > 1:
+ predictions, references = accelerator.gather_for_metrics((predictions, references))
+ metric.add_batch(predictions=predictions, references=references)
+ return metric.compute()
diff --git a/benchmarks/fp8/torchao/fsdp.py b/benchmarks/fp8/torchao/fsdp.py
new file mode 100644
index 00000000000..42eedb48bd5
--- /dev/null
+++ b/benchmarks/fp8/torchao/fsdp.py
@@ -0,0 +1,173 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script tests to ensure that `accelerate` performs at the same level as raw `torchao`.
+
+This particular script verifies this for FSDP training.
+"""
+
+from functools import partial
+
+import evaluate
+import torch
+from fp8_utils import get_training_utilities
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import MixedPrecision
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torchao.float8 import convert_to_float8_training
+from transformers.models.bert import BertLayer
+
+from accelerate import Accelerator
+from accelerate import FullyShardedDataParallelPlugin as FSDPPlugin
+from accelerate.state import AcceleratorState
+from accelerate.utils import AORecipeKwargs, set_seed
+
+
+MODEL_NAME = "bert-base-cased"
+METRIC = evaluate.load("glue", "mrpc")
+
+FSDP_WRAP_POLICY = partial(transformer_auto_wrap_policy, transformer_layer_cls={BertLayer})
+
+
+def filter_linear_layers(module, fqn, first_layer_name=None, last_layer_name=None):
+ if isinstance(module, torch.nn.Linear):
+ if module.in_features % 16 != 0 or module.out_features % 16 != 0:
+ return False
+ # For stability reasons, we skip the first and last linear layers
+ # Otherwise, this can lead to the model not training or converging properly
+ if fqn in (first_layer_name, last_layer_name):
+ return False
+ return True
+
+
+def evaluate_model(model, dataloader, metric, accelerator=None):
+ "Turns model to .eval(), runs dataloader, calculates metric, then turns eval back on"
+ model.eval()
+ for step, batch in enumerate(dataloader):
+ with torch.no_grad():
+ outputs = model(**batch)
+ predictions = outputs.logits.argmax(dim=-1)
+ references = batch["labels"]
+ if accelerator is not None and accelerator.num_processes > 1:
+ predictions, references = accelerator.gather_for_metrics((predictions, references))
+ metric.add_batch(predictions=predictions, references=references)
+ return metric.compute()
+
+
+def train_baseline():
+ set_seed(42)
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
+ first_linear = None
+ last_linear = None
+ for name, module in model.named_modules():
+ if isinstance(module, torch.nn.Linear):
+ if first_linear is None:
+ first_linear = name
+ last_linear = name
+ func = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear)
+ accelerator = Accelerator()
+ device = accelerator.device
+ model.to(device)
+
+ convert_to_float8_training(model, module_filter_fn=func)
+
+ # Convert the model to FSDP
+ model = FSDP(
+ model,
+ use_orig_params=True,
+ mixed_precision=MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32),
+ auto_wrap_policy=FSDP_WRAP_POLICY,
+ )
+
+ base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+ model.train()
+
+ for batch in train_dataloader:
+ with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+ batch = batch.to(device)
+ outputs = model(**batch)
+ loss = outputs.loss
+ loss.backward()
+ optimizer.step()
+ optimizer.zero_grad()
+ lr_scheduler.step()
+
+ trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+
+ assert (
+ trained_model_results["accuracy"] > base_model_results["accuracy"]
+ ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
+ assert (
+ trained_model_results["f1"] > base_model_results["f1"]
+ ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+
+ return base_model_results, trained_model_results
+
+
+def train_integration():
+ AcceleratorState()._reset_state(True)
+ fsdp_plugin = FSDPPlugin(
+ auto_wrap_policy=FSDP_WRAP_POLICY,
+ use_orig_params=True,
+ mixed_precision_policy=MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32),
+ )
+ accelerator = Accelerator(mixed_precision="fp8", fsdp_plugin=fsdp_plugin, kwargs_handlers=[AORecipeKwargs()])
+ set_seed(42)
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
+ MODEL_NAME, accelerator=accelerator
+ )
+
+ model, optimizer = accelerator.prepare(model, optimizer)
+ base_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+ model.train()
+
+ for batch in train_dataloader:
+ outputs = model(**batch)
+ loss = outputs.loss
+ accelerator.backward(loss)
+ optimizer.step()
+ optimizer.zero_grad()
+ lr_scheduler.step()
+
+ trained_model_results = evaluate_model(model, eval_dataloader, METRIC, accelerator=accelerator)
+
+ assert (
+ trained_model_results["accuracy"] > base_model_results["accuracy"]
+ ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
+ assert (
+ trained_model_results["f1"] > base_model_results["f1"]
+ ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+
+ return base_model_results, trained_model_results
+
+
+if __name__ == "__main__":
+ baseline_not_trained, baseline_trained = train_baseline()
+ accelerator_not_trained, accelerator_trained = train_integration()
+
+ assert (
+ baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
+ ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
+ assert (
+ baseline_not_trained["f1"] == accelerator_not_trained["f1"]
+ ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
+ assert (
+ baseline_trained["accuracy"] == accelerator_trained["accuracy"]
+ ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
+ assert (
+ baseline_trained["f1"] == accelerator_trained["f1"]
+ ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+
+ torch.distributed.destroy_process_group()
diff --git a/benchmarks/fp8/torchao/non_distributed.py b/benchmarks/fp8/torchao/non_distributed.py
new file mode 100644
index 00000000000..7b8e5993e42
--- /dev/null
+++ b/benchmarks/fp8/torchao/non_distributed.py
@@ -0,0 +1,145 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script tests to ensure that `accelerate` performs at the same level as raw `torchao`.
+
+This particular script verifies this for single GPU training.
+"""
+
+from functools import partial
+
+import evaluate
+import torch
+from fp8_utils import get_training_utilities
+from torchao.float8 import convert_to_float8_training
+
+from accelerate import Accelerator
+from accelerate.state import AcceleratorState
+from accelerate.utils import AORecipeKwargs, set_seed
+
+
+MODEL_NAME = "bert-base-cased"
+METRIC = evaluate.load("glue", "mrpc")
+
+
+def evaluate_model(model, dataloader, metric, accelerator=None):
+ "Turns model to .eval(), runs dataloader, calculates metric, then turns eval back on"
+ model.eval()
+ for step, batch in enumerate(dataloader):
+ with torch.no_grad():
+ outputs = model(**batch)
+ predictions = outputs.logits.argmax(dim=-1)
+ references = batch["labels"]
+ if accelerator is not None and accelerator.num_processes > 1:
+ predictions, references = accelerator.gather_for_metrics((predictions, references))
+ metric.add_batch(predictions=predictions, references=references)
+ return metric.compute()
+
+
+def filter_linear_layers(module, fqn, first_layer_name=None, last_layer_name=None):
+ if isinstance(module, torch.nn.Linear):
+ if module.in_features % 16 != 0 or module.out_features % 16 != 0:
+ return False
+ # For stability reasons, we skip the first and last linear layers
+ # Otherwise, this can lead to the model not training or converging properly
+ if fqn in (first_layer_name, last_layer_name):
+ return False
+ return True
+
+
+def train_baseline():
+ set_seed(42)
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(MODEL_NAME)
+ first_linear = None
+ last_linear = None
+ for name, module in model.named_modules():
+ if isinstance(module, torch.nn.Linear):
+ if first_linear is None:
+ first_linear = name
+ last_linear = name
+
+ func = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear)
+ model.to("cuda")
+ convert_to_float8_training(model, module_filter_fn=func)
+ base_model_results = evaluate_model(model, eval_dataloader, METRIC)
+ model.train()
+
+ for batch in train_dataloader:
+ with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+ outputs = model(**batch)
+ loss = outputs.loss
+ loss.backward()
+ optimizer.step()
+ optimizer.zero_grad()
+ lr_scheduler.step()
+
+ trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
+
+ assert (
+ trained_model_results["accuracy"] > base_model_results["accuracy"]
+ ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
+ assert (
+ trained_model_results["f1"] > base_model_results["f1"]
+ ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+
+ return base_model_results, trained_model_results
+
+
+def train_integration():
+ set_seed(42)
+ accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[AORecipeKwargs()])
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = get_training_utilities(
+ MODEL_NAME, accelerator=accelerator
+ )
+ model = accelerator.prepare(model)
+ base_model_results = evaluate_model(model, eval_dataloader, METRIC)
+ model.train()
+
+ for batch in train_dataloader:
+ outputs = model(**batch)
+ loss = outputs.loss
+ loss.backward()
+ optimizer.step()
+ optimizer.zero_grad()
+ lr_scheduler.step()
+
+ trained_model_results = evaluate_model(model, eval_dataloader, METRIC)
+
+ assert (
+ trained_model_results["accuracy"] > base_model_results["accuracy"]
+ ), f'Accuracy should be higher for the trained model: {trained_model_results["accuracy"]} > {base_model_results["accuracy"]}'
+ assert (
+ trained_model_results["f1"] > base_model_results["f1"]
+ ), f'F1 score should be higher for the trained model: {trained_model_results["f1"]} > {base_model_results["f1"]}'
+
+ return base_model_results, trained_model_results
+
+
+if __name__ == "__main__":
+ baseline_not_trained, baseline_trained = train_baseline()
+ AcceleratorState._reset_state(True)
+ accelerator_not_trained, accelerator_trained = train_integration()
+ assert (
+ baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
+ ), f'Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
+ assert (
+ baseline_not_trained["f1"] == accelerator_not_trained["f1"]
+ ), f'F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
+ assert (
+ baseline_trained["accuracy"] == accelerator_trained["accuracy"]
+ ), f'Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
+ assert (
+ baseline_trained["f1"] == accelerator_trained["f1"]
+ ), f'F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
diff --git a/benchmarks/fp8/transformer_engine/distrib_deepspeed.py b/benchmarks/fp8/transformer_engine/distrib_deepspeed.py
index e678deb3659..73953b6793f 100644
--- a/benchmarks/fp8/transformer_engine/distrib_deepspeed.py
+++ b/benchmarks/fp8/transformer_engine/distrib_deepspeed.py
@@ -66,7 +66,7 @@ def train_baseline(zero_stage: int = 1):
import numpy as np
config = {
- "train_batch_size": 32,
+ "train_batch_size": 16,
"train_micro_batch_size_per_gpu": 16,
"gradient_accumulation_steps": 1,
"zero_optimization": {
@@ -170,21 +170,22 @@ def train_integration(zero_stage: int = 1):
if __name__ == "__main__":
- # for zero_stage in [1, 2, 3]:
- zero_stage = 1
- baseline_not_trained, baseline_trained, baseline_outputs, baseline_data = train_baseline(zero_stage)
- accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(zero_stage)
- assert (
- baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
- ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
- assert (
- baseline_not_trained["f1"] == accelerator_not_trained["f1"]
- ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
- assert (
- baseline_trained["accuracy"] == accelerator_trained["accuracy"]
- ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
- assert (
- baseline_trained["f1"] == accelerator_trained["f1"]
- ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
-
- torch.distributed.destroy_process_group()
+ for zero_stage in [1, 2, 3]:
+ baseline_not_trained, baseline_trained, baseline_outputs, baseline_data = train_baseline(zero_stage)
+ accelerator_not_trained, accelerator_trained, accelerator_outputs, accelerator_data = train_integration(
+ zero_stage
+ )
+ assert (
+ baseline_not_trained["accuracy"] == accelerator_not_trained["accuracy"]
+ ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_not_trained["accuracy"]} == {accelerator_not_trained["accuracy"]}'
+ assert (
+ baseline_not_trained["f1"] == accelerator_not_trained["f1"]
+ ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_not_trained["f1"]} == {accelerator_not_trained["f1"]}'
+ assert (
+ baseline_trained["accuracy"] == accelerator_trained["accuracy"]
+ ), f'ZERO stage {zero_stage}: Accuracy should be the same for the baseline and accelerator: {baseline_trained["accuracy"]} == {accelerator_trained["accuracy"]}'
+ assert (
+ baseline_trained["f1"] == accelerator_trained["f1"]
+ ), f'ZERO stage {zero_stage}: F1 score should be the same for the baseline and accelerator: {baseline_trained["f1"]} == {accelerator_trained["f1"]}'
+
+ torch.distributed.destroy_process_group()
diff --git a/docs/source/usage_guides/low_precision_training.md b/docs/source/usage_guides/low_precision_training.md
index c730136e1ce..08e533e60cf 100644
--- a/docs/source/usage_guides/low_precision_training.md
+++ b/docs/source/usage_guides/low_precision_training.md
@@ -15,7 +15,7 @@ rendered properly in your Markdown viewer.
# Low Precision Training Methods
-Accelerate provides integrations to train on lower precision methods using specified supported hardware through the `TransformersEngine` and `MS-AMP` packages. This documentation will help guide you through what hardware is supported, how to configure your [`Accelerator`] to leverage the low precision methods, and what you can expect when training.
+Accelerate provides integrations to train on lower precision methods using specified supported hardware through the `TransformersEngine`, `MS-AMP`, and `torchao` packages. This documentation will help guide you through what hardware is supported, how to configure your [`Accelerator`] to leverage the low precision methods, and what you can expect when training.
## What training on FP8 means
@@ -30,7 +30,7 @@ What this will result in is some gain in the memory used (as we've cut the neede
## Configuring the Accelerator
-Currently two different backends for FP8 are supported (`TransformersEngine` and `MS-AMP`), each with different capabilities and configurations.
+Currently three different backends for FP8 are supported (`TransformersEngine`, `torchao`, and `MS-AMP`), each with different capabilities and configurations.
To use any of them, the same core API is used. Just pass `mixed_precision="fp8"` to the [`Accelerator`], during `accelerate config` when prompted about mixed precision, or as part of your `config.yaml` file in the `mixed_precision` key:
@@ -39,14 +39,16 @@ from accelerate import Accelerator
accelerator = Accelerator(mixed_precision="fp8")
```
-By default, if `MS-AMP` is available in your environment, Accelerate will automatically utilize it as a backend. To specify it yourself (and customize other parts of the FP8 mixed precision setup), you can utilize the [`utils.FP8RecipeKwargs`] or clarify it in your config `yaml`/during `accelerate launch`:
+By default, if a compatible backend is available in your environment, Accelerate will automatically pick one (checking for `torchao`, then `TransformersEngine`, then `MS-AMP`). To specify a backend yourself (and customize other parts of the FP8 mixed precision setup), you can utilize one of the `RecipeKwargs` dataclasses such as [`utils.AORecipeKwargs`], [`utils.TERecipeKwargs`], or [`utils.MSAMPRecipeKwargs`]; you can also specify it in your config `yaml`/during `accelerate launch`:
```{python}
from accelerate import Accelerator
-from accelerate.utils import FP8RecipeKwargs
-kwargs = [FP8RecipeKwargs(backend="msamp")]
+from accelerate.utils import MSAMPRecipeKwargs
+kwargs = [MSAMPRecipeKwargs()]
# Or to specify the backend as `TransformersEngine` even if MS-AMP is installed
-# kwargs = [FP8RecipeKwargs(backend="te")]
+# kwargs = [TERecipeKwargs()]
+# Or to use torchao
+# kwargs = [AORecipeKwargs()]
accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs)
```
@@ -124,6 +126,22 @@ fp8_config:
use_autocast_during_eval: false
```
+## Configuring `torchao`
+
+`torchao` is a [PyTorch-driven](https://github.com/pytorch/ao/tree/main/torchao/float8) hackable FP8 backend, aiming to be more approachable than the prior two engines. One of the core differences compared to them is that, for numerical stability, it is generally better to keep the first *and* last layers of the model at the regular precision (be it FP32 or BF16) and quantize only the remaining layers down to FP8. As a result, a config for `ao` looks a bit different:
+
+> Note: this API is experimental and is subject to change
+
+```{python}
+from accelerate import Accelerator
+from accelerate.utils import AORecipeKwargs
+kwargs = [AORecipeKwargs()]
+accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs)
+```
+
+To learn more about the specific parameters to be used, please see the official `torchao` repo.
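+
+As a minimal sketch of customizing the recipe, a filter function can be passed through `module_filter_func` (it receives each module and its fully qualified name and returns whether that module should be converted; the `classifier` name below is a hypothetical example):
+
+```{python}
+import torch
+
+from accelerate import Accelerator
+from accelerate.utils import AORecipeKwargs
+
+
+def module_filter_func(module, fqn):
+    # Only `nn.Linear` layers with dimensions divisible by 16 can be converted to FP8
+    if isinstance(module, torch.nn.Linear) and (module.in_features % 16 != 0 or module.out_features % 16 != 0):
+        return False
+    # Hypothetical: keep the classification head at the original precision
+    if "classifier" in fqn:
+        return False
+    return True
+
+
+kwargs = [AORecipeKwargs(module_filter_func=module_filter_func)]
+accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=kwargs)
+```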
+
+
## Example Zoo
We have examples showcasing training with FP8 both with accelerate and its underlying implementation available in the accelerate repo.
@@ -143,3 +161,4 @@ To learn more about training in FP8 please check out the following resources:
* [Our concept guide](../concept_guides/low_precision_training) detailing into more about both TransformersEngine and MS-AMP
* [The `transformers-engine` documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html)
* [The `MS-AMP` documentation](https://azure.github.io/MS-AMP/docs/)
+* [The `torchao` documentation](https://github.com/pytorch/ao/tree/main/torchao/float8)
diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index a483f0d1a39..7d05dafb600 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -33,6 +33,8 @@
import torch.utils.hooks as hooks
from huggingface_hub import split_torch_state_dict_into_shards
+from accelerate.utils.imports import is_torchao_available
+
from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
from .data_loader import DataLoaderDispatcher, prepare_data_loader, skip_first_batches
from .logging import get_logger
@@ -48,6 +50,7 @@
WEIGHTS_INDEX_NAME,
WEIGHTS_NAME,
WEIGHTS_PATTERN_NAME,
+ AORecipeKwargs,
AutocastKwargs,
DataLoaderConfiguration,
DeepSpeedPlugin,
@@ -62,10 +65,12 @@
KwargsHandler,
LoggerType,
MegatronLMPlugin,
+ MSAMPRecipeKwargs,
PrecisionType,
ProfileKwargs,
ProjectConfiguration,
RNGType,
+ TERecipeKwargs,
TorchDynamoPlugin,
TorchTensorParallelPlugin,
apply_fp8_autowrap,
@@ -73,6 +78,7 @@
clean_state_dict_for_safetensors,
compare_versions,
convert_model,
+ convert_model_to_fp8_ao,
convert_outputs_to_fp32,
ensure_weights_retied,
extract_model_from_parallel,
@@ -409,45 +415,39 @@ def __init__(
self.scaler_handler = None
self.init_handler = None
self.fp8_recipe_handler = None
+ self.ao_recipe_handler = None
+ self.te_recipe_handler = None
+ self.msamp_recipe_handler = None
self.autocast_handler = None
self.profile_handler = None
self.has_lomo_optimizer = False
+ found_handlers = set()
+ handler_class_to_attr = {
+ DistributedDataParallelKwargs: "ddp_handler",
+ GradScalerKwargs: "scaler_handler",
+ InitProcessGroupKwargs: "init_handler",
+ FP8RecipeKwargs: "fp8_recipe_handler",
+ AutocastKwargs: "autocast_handler",
+ ProfileKwargs: "profile_handler",
+ AORecipeKwargs: "ao_recipe_handler",
+ TERecipeKwargs: "te_recipe_handler",
+ MSAMPRecipeKwargs: "msamp_recipe_handler",
+ }
+ self.has_fp8_handler = False
if kwargs_handlers is not None:
for handler in kwargs_handlers:
assert isinstance(
handler, KwargsHandler
), f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`."
- if isinstance(handler, DistributedDataParallelKwargs):
- if self.ddp_handler is not None:
- raise ValueError("You can only pass one `DistributedDataParallelKwargs` in `kwargs_handler`.")
- else:
- self.ddp_handler = handler
- elif isinstance(handler, GradScalerKwargs):
- if self.scaler_handler is not None:
- raise ValueError("You can only pass one `GradScalerKwargs` in `kwargs_handler`.")
- else:
- self.scaler_handler = handler
- elif isinstance(handler, InitProcessGroupKwargs):
- if self.init_handler is not None:
- raise ValueError("You can only pass one `InitProcessGroupKwargs` in `kwargs_handler`.")
- else:
- self.init_handler = handler
- elif isinstance(handler, FP8RecipeKwargs):
- if self.fp8_recipe_handler is not None:
- raise ValueError("You can only pass one `FP8RecipeKwargs` in `kwargs_handler`.")
- else:
- self.fp8_recipe_handler = handler
- elif isinstance(handler, AutocastKwargs):
- if self.autocast_handler is not None:
- raise ValueError("You can only pass one `AutocastKwargs` in `kwargs_handler`.")
- else:
- self.autocast_handler = handler
- elif isinstance(handler, ProfileKwargs):
- if self.profile_handler is not None:
- raise ValueError("You can only pass one `ProfileKwargs` in `kwargs_handler`.")
- else:
- self.profile_handler = handler
+ # Add the handler class to the set of found handlers
+ if handler.__class__ in found_handlers:
+ raise ValueError(f"You can only pass one {handler.__class__} in `kwargs_handlers`.")
+ found_handlers.add(handler.__class__)
+ handler_attr = handler_class_to_attr[handler.__class__]
+ setattr(self, handler_attr, handler)
+ if "recipe_handler" in handler_attr and not self.has_fp8_handler:
+ self.has_fp8_handler = True
kwargs = self.init_handler.to_kwargs() if self.init_handler is not None else {}
self.state = AcceleratorState(
@@ -463,17 +463,32 @@ def __init__(
)
self._mixed_precision = mixed_precision
- if mixed_precision == "fp8" and self.fp8_recipe_handler is None:
- self.fp8_recipe_handler = FP8RecipeKwargs()
+ # Check for automatic FP8 recipe creation
+ if self._mixed_precision == "fp8" and not self.has_fp8_handler:
+ # Prioritize AO -> TE -> MSAMP
+ if is_torchao_available():
+ logger.info("Found `torchao` installed, using it for FP8 training.")
+ self.ao_recipe_handler = AORecipeKwargs()
+ elif is_transformer_engine_available():
+ logger.info("Found `transformer-engine` installed, using it for FP8 training.")
+ self.te_recipe_handler = TERecipeKwargs()
+ elif is_msamp_available():
+ logger.info("Found `msamp` installed, using it for FP8 training.")
+ self.msamp_recipe_handler = MSAMPRecipeKwargs()
+ else:
+ raise ImportError(
+ "Tried to train with `fp8` and auto-detect backend, but no FP8-compatible backend was installed. "
+ "Valid backends are: `torchao`, `transformer-engine`, and `msamp`."
+ )
self.delayed_fp8_autocast = False
- if self.fp8_recipe_handler is not None:
+ if self.has_fp8_handler:
# We already check if FP8 is available during `self.state`
if mixed_precision != "fp8" and (
self.distributed_type not in (DistributedType.FSDP, DistributedType.DEEPSPEED)
):
- raise ValueError("Passing in a `FP8RecipeKwargs` object requires setting `mixed_precision='fp8'`.")
- self.delayed_fp8_autocast = self.fp8_recipe_handler.backend == "TE" and self.distributed_type in (
+ raise ValueError("Passing in an FP8 configuration requires setting `mixed_precision='fp8'`.")
+ self.delayed_fp8_autocast = self.fp8_backend == "TE" and self.distributed_type in (
DistributedType.MULTI_GPU,
DistributedType.FSDP,
)
@@ -1362,6 +1377,8 @@ def prepare(self, *args, device_placement=None):
args = self._prepare_ipex_or_xpu(*args)
if self.fp8_backend == "TE":
args = self._prepare_te(*args)
+ elif self.fp8_backend == "AO":
+ args = self._prepare_ao(*args)
if self.distributed_type == DistributedType.DEEPSPEED:
result = self._prepare_deepspeed(*args)
elif self.distributed_type == DistributedType.MEGATRON_LM:
@@ -1447,7 +1464,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
# We prepare TE after, allowing for bf16 autocast to happen first
if self.fp8_backend == "TE" and not self.delayed_fp8_autocast:
- model = apply_fp8_autowrap(model, self.fp8_recipe_handler)
+ model = apply_fp8_autowrap(model, self.te_recipe_handler or self.fp8_recipe_handler)
if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr(
model, "hf_device_map", False
@@ -1651,12 +1668,26 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
model = xmp.MpModelWrapper(model).to(self.device)
# Now we can apply the FP8 autocast
if self.delayed_fp8_autocast:
- model = apply_fp8_autowrap(model, self.fp8_recipe_handler)
+ model = apply_fp8_autowrap(model, self.te_recipe_handler or self.fp8_recipe_handler)
# torch.compile should be called last and only if the model isn't already compiled.
if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
model = torch.compile(model, **self.state.dynamo_plugin.to_kwargs())
return model
+ def _prepare_ao(self, *args):
+ if not is_torchao_available():
+ raise ImportError(
+ "`torchao` was not found on your system or is too old of a version. Please ensure that `torchao >= 0.6.1` is installed"
+ )
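+ # Convert any model passed in to FP8 in place; all other args are returned untouched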
+ for arg in args:
+ if isinstance(arg, torch.nn.Module):
+ convert_model_to_fp8_ao(
+ arg,
+ config=self.ao_recipe_handler.config,
+ module_filter_func=self.ao_recipe_handler.module_filter_func,
+ )
+ return args
+
def _prepare_te(self, *args):
if not is_transformer_engine_available():
raise ImportError(
@@ -1811,7 +1842,7 @@ def _prepare_deepspeed(self, *args):
if model is not None:
# If we are using FP8, we need to apply the autowrap now
- if getattr(self.fp8_recipe_handler, "backend", None) == "TE":
+ if self.fp8_backend == "TE":
model = apply_fp8_autowrap(model, self.fp8_recipe_handler)
# if the model is an MOE, set the appropriate MOE layers as leaf Z3 modules
deepspeed_plugin.set_moe_leaf_modules(model)
@@ -2106,7 +2137,12 @@ def _prepare_msamp(self, *args, device_placement):
f"You can't use multiple models ({num_models}) or optimizers {num_optimizers} with MS-AMP."
)
else:
- model, optimizer = msamp.initialize(model, optimizer, opt_level=self.fp8_recipe_handler.opt_level)
+ # DEPRECATE @ 2.0
+ if self.fp8_recipe_handler is not None:
+ opt_level = self.fp8_recipe_handler.opt_level
+ else:
+ opt_level = self.msamp_recipe_handler.opt_level
+ model, optimizer = msamp.initialize(model, optimizer, opt_level=opt_level)
for i in range(len(result)):
if isinstance(result[i], torch.nn.Module):
result[i] = model
@@ -3647,8 +3683,15 @@ def lomo_backward(self, loss: torch.Tensor, learning_rate: float) -> None:
@property
def fp8_backend(self):
"Returns the configured backend for training in FP8"
- if self._mixed_precision == "fp8" and self.fp8_recipe_handler is not None:
- return self.fp8_recipe_handler.backend
+ if self.has_fp8_handler:
+ if self.fp8_recipe_handler is not None:
+ return self.fp8_recipe_handler.backend
+ elif self.ao_recipe_handler is not None:
+ return "AO"
+ elif self.te_recipe_handler is not None:
+ return "TE"
+ elif self.msamp_recipe_handler is not None:
+ return "MSAMP"
elif self.state.deepspeed_plugin is not None and self.state.deepspeed_plugin.enable_msamp:
return "MSAMP"
return None
diff --git a/src/accelerate/test_utils/__init__.py b/src/accelerate/test_utils/__init__.py
index f41473f6363..6b59fb0246d 100644
--- a/src/accelerate/test_utils/__init__.py
+++ b/src/accelerate/test_utils/__init__.py
@@ -41,6 +41,7 @@
require_single_gpu,
require_single_xpu,
require_torch_min_version,
+ require_torchao,
require_torchvision,
require_tpu,
require_transformer_engine,
diff --git a/src/accelerate/test_utils/testing.py b/src/accelerate/test_utils/testing.py
index bbcfc616ada..48350b0a4a7 100644
--- a/src/accelerate/test_utils/testing.py
+++ b/src/accelerate/test_utils/testing.py
@@ -53,6 +53,7 @@
is_timm_available,
is_torch_version,
is_torch_xla_available,
+ is_torchao_available,
is_torchdata_stateful_dataloader_available,
is_torchvision_available,
is_transformer_engine_available,
@@ -425,6 +426,13 @@ def require_transformer_engine(test_case):
return unittest.skipUnless(is_transformer_engine_available(), "test requires transformers engine")(test_case)
+def require_torchao(test_case):
+ """
+ Decorator marking a test that requires torchao installed. These tests are skipped when torchao isn't installed
+ """
+ return unittest.skipUnless(is_torchao_available(), "test requires torchao")(test_case)
+
+
_atleast_one_tracker_available = (
any([is_wandb_available(), is_tensorboard_available()]) and not is_comet_ml_available()
)
diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py
index e0ea5841372..6b5bed62d73 100644
--- a/src/accelerate/utils/__init__.py
+++ b/src/accelerate/utils/__init__.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from .ao import convert_model_to_fp8_ao, filter_first_and_last_linear_layers, has_ao_layers
from .constants import (
MITA_PROFILING_AVAILABLE_PYTORCH_VERSION,
MODEL_NAME,
@@ -32,6 +33,7 @@
XPU_PROFILING_AVAILABLE_PYTORCH_VERSION,
)
from .dataclasses import (
+ AORecipeKwargs,
AutocastKwargs,
BnbQuantizationConfig,
ComputeEnvironment,
@@ -50,12 +52,14 @@
KwargsHandler,
LoggerType,
MegatronLMPlugin,
+ MSAMPRecipeKwargs,
PrecisionType,
ProfileKwargs,
ProjectConfiguration,
RNGType,
SageMakerDistributedType,
TensorInformation,
+ TERecipeKwargs,
TorchDynamoPlugin,
TorchTensorParallelPlugin,
add_model_config_to_megatron_parser,
@@ -115,6 +119,7 @@
is_tensorboard_available,
is_timm_available,
is_torch_xla_available,
+ is_torchao_available,
is_torchdata_available,
is_torchdata_stateful_dataloader_available,
is_torchvision_available,
@@ -124,6 +129,7 @@
is_wandb_available,
is_weights_only_available,
is_xpu_available,
+ torchao_required,
)
from .modeling import (
align_module_device,
diff --git a/src/accelerate/utils/ao.py b/src/accelerate/utils/ao.py
new file mode 100644
index 00000000000..2023371ca66
--- /dev/null
+++ b/src/accelerate/utils/ao.py
@@ -0,0 +1,139 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Needed utilities for torchao FP8 training.
+"""
+
+from functools import partial
+from typing import Callable, List, Optional
+
+import torch
+
+from .imports import is_torchao_available, torchao_required
+
+
+if is_torchao_available():
+ from torchao.float8.float8_linear import Float8LinearConfig
+
+
+def find_first_last_linear_layers(model: torch.nn.Module):
+ """
+ Finds the first and last linear layer names in a model.
+
+ This is needed during FP8 to avoid issues with instability by keeping the first and last layers unquantized.
+
+ Ref: https://x.com/xariusrke/status/1826669142604141052
+ """
+ first_linear, last_linear = None, None
+ for name, module in model.named_modules():
+ if isinstance(module, torch.nn.Linear):
+ if first_linear is None:
+ first_linear = name
+ last_linear = name
+ return first_linear, last_linear
+
+
+def filter_linear_layers(module, fqn: str, layers_to_filter: List[str]) -> bool:
+ """
+ A function which will check if `module` is:
+ - a `torch.nn.Linear` layer
+ - has in_features and out_features divisible by 16
+ - is not part of `layers_to_filter`
+
+ Args:
+ module (`torch.nn.Module`):
+ The module to check.
+ fqn (`str`):
+ The fully qualified name of the layer.
+ layers_to_filter (`List[str]`):
+ The list of layers to filter.
+ """
+ if isinstance(module, torch.nn.Linear):
+ if module.in_features % 16 != 0 or module.out_features % 16 != 0:
+ return False
+ if fqn in layers_to_filter:
+ return False
+ return True
+
+
+def filter_first_and_last_linear_layers(module, fqn: str) -> bool:
+ """
+ A filter function which excludes the first and last linear layers from FP8 conversion.
+
+ For stability reasons, we skip the first and last linear layers; otherwise, this can lead to the model not
+ training or converging properly.
+
+ Args:
+ module (`torch.nn.Module`):
+ The module to check.
+ fqn (`str`):
+ The fully qualified name of the layer.
+ """
+ first_linear, last_linear = find_first_last_linear_layers(module)
+ return filter_linear_layers(module, fqn, layers_to_filter=[first_linear, last_linear])
+
+
+@torchao_required
+def has_ao_layers(model: torch.nn.Module):
+ from torchao.float8.float8_linear import Float8Linear
+
+ for name, module in model.named_modules():
+ if isinstance(module, Float8Linear):
+ return True
+ return False
+
+
+@torchao_required
+def convert_model_to_fp8_ao(
+ model: torch.nn.Module,
+ config: Optional["Float8LinearConfig"] = None,
+ module_filter_func: Optional[Callable] = filter_first_and_last_linear_layers,
+):
+ """
+ Converts all `nn.Linear` layers in the model (except the first and last) to torchao's `Float8Linear` layer inplace.
+
+ Args:
+ model (`torch.nn.Module`):
+ The model to convert.
+ config (`torchao.float8.Float8LinearConfig`, *optional*):
+ The configuration for the FP8 training. Recommended to utilize
+ `torchao.float8.recipe_name_to_linear_config` to generate this. In general, the default config should be
+ sufficient (what is passed when set to `None`).
+ module_filter_func (`Callable`, *optional*, defaults to `filter_first_and_last_linear_layers`):
+ Optional function that takes in a module and its fully qualified name and returns a boolean indicating whether
+ the module should be converted to FP8. Defaults to `filter_first_and_last_linear_layers`. See it for an example.
+
+ Example:
+
+ ```python
+ from accelerate.utils.ao import convert_model_to_fp8_ao
+
+ model = MyModel()
+ model.to("cuda")
+ convert_model_to_fp8_ao(model)
+
+ model.train()
+ ```
+ """
+ from torchao.float8 import convert_to_float8_training
+
+ first_linear, last_linear = find_first_last_linear_layers(model)
+ if module_filter_func is None:
+ module_filter_func = partial(filter_linear_layers, layers_to_filter=[first_linear, last_linear])
+ convert_to_float8_training(model, module_filter_fn=module_filter_func, config=config)
diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py
index 3baa525d294..9936ee8c00c 100644
--- a/src/accelerate/utils/dataclasses.py
+++ b/src/accelerate/utils/dataclasses.py
@@ -20,12 +20,13 @@
import copy
import enum
import functools
+import logging
import os
import warnings
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import timedelta
-from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple, Union, get_args
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple, Union, get_args
import torch
@@ -50,6 +51,13 @@
from .versions import compare_versions, is_torch_version
+if TYPE_CHECKING:
+ # Mock imports for type checking
+ from torchao.float8 import Float8LinearConfig
+
+logger = logging.getLogger(__name__)
+
+
class KwargsHandler:
"""
Internal mixin that implements a `to_kwargs()` method for a dataclass.
@@ -281,40 +289,48 @@ def __post_init__(self):
AmaxComputeAlgorithm = Literal["max", "most_recent"]
+# FP8 training recipe kwargs
+@dataclass
+class AORecipeKwargs(KwargsHandler):
+ """
+ Use this object in your [`Accelerator`] to customize the initialization of the recipe for FP8 mixed precision
+ training with `torchao` FP8.
+
+ Args:
+ config (`torchao.float8.Float8LinearConfig`, *optional*, default to `None`):
+ The configuration for the FP8 training. In general, the default config should be sufficient.
+ module_filter_func (`Callable`, *optional*, default to `None`):
+ Optional function that takes in a module and its fully qualified name and returns a boolean indicating whether
+ the module should be converted to FP8. Defaults to `accelerate.utils.ao.filter_linear_layers`. See it for an
+ example.
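+
+ Example (a minimal sketch, mirroring the usage in `tests/test_fp8.py`):
+
+ ```python
+ from accelerate import Accelerator
+ from accelerate.utils import AORecipeKwargs
+
+ kwargs = AORecipeKwargs()
+ accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[kwargs])
+ ```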
+ """
+
+ config: Optional["Float8LinearConfig"] = None
+ module_filter_func: Optional[Callable] = None
+
+
@dataclass
-class FP8RecipeKwargs(KwargsHandler):
+class TERecipeKwargs(KwargsHandler):
"""
Use this object in your [`Accelerator`] to customize the initialization of the recipe for FP8 mixed precision
- training with `transformer-engine` or `ms-amp`.
+ training with `transformer-engine`.
- For more information on `transformer-engine` args, please refer to the API
+ For more information on the args, please refer to the API
[documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html).
- For more information on the `ms-amp` args, please refer to the Optimization Level
- [documentation](https://azure.github.io/MS-AMP/docs/user-tutorial/optimization-level).
-
```python
from accelerate import Accelerator
- from accelerate.utils import FP8RecipeKwargs
+ from accelerate.utils import TERecipeKwargs
- kwargs = FP8RecipeKwargs(backend="te", fp8_format="HYBRID")
+ kwargs = TERecipeKwargs(fp8_format="HYBRID")
accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[kwargs])
```
- To use MS-AMP as an engine, pass `backend="msamp"` and the `optimization_level`:
-
- ```python
- kwargs = FP8RecipeKwargs(backend="msamp", optimization_level="02")
- ```
-
Args:
- backend (`str`, *optional*):
- Which FP8 engine to use. Must be one of `"msamp"` (MS-AMP) or `"te"` (TransformerEngine). If not passed,
- will use whichever is available in the environment, prioritizing MS-AMP.
use_autocast_during_eval (`bool`, *optional*, default to `False`):
Whether to use FP8 autocast during eval mode. Generally better metrics are found when this is `False`.
margin (`int`, *optional*, default to 0):
@@ -330,21 +346,9 @@ class FP8RecipeKwargs(KwargsHandler):
The algorithm to use for the scaling factor computation. Must be one of `max` or `most_recent`.
override_linear_precision (`tuple` of three `bool`, *optional*, default to `(False, False, False)`):
Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision.
- optimization_level (`str`), one of `O1`, `O2`. (default is `O2`):
- What level of 8-bit collective communication should be used with MS-AMP. In general:
- * O1: Weight gradients and `all_reduce` communications are done in fp8, reducing GPU
- memory usage and communication bandwidth
- * O2: First-order optimizer states are in 8-bit, and second order states are in FP16.
- Only available when using Adam or AdamW. This maintains accuracy and can potentially save the
- highest memory.
- * 03: Specifically for DeepSpeed, implements capabilities so weights and master weights of models
- are stored in FP8. If `fp8` is selected and deepspeed is enabled, will be used by default. (Not
- available currently).
"""
- backend: Backend = None
use_autocast_during_eval: bool = None
- opt_level: OptLevel = None
margin: int = None
interval: int = None
fp8_format: FP8Format = None
@@ -354,50 +358,73 @@ class FP8RecipeKwargs(KwargsHandler):
def __post_init__(self):
env_prefix = "ACCELERATE_FP8_"
+ if not is_transformer_engine_available():
+ raise ImportError("TransformerEngine is not available. Please install it or use a different backend.")
+ if self.use_autocast_during_eval is None:
+ self.use_autocast_during_eval = parse_flag_from_env(env_prefix + "USE_AUTOCAST_DURING_EVAL")
+ if self.margin is None:
+ self.margin = int(os.environ.get(env_prefix + "MARGIN", 0))
+ if self.interval is None:
+ self.interval = int(os.environ.get(env_prefix + "INTERVAL", 1))
+ if self.fp8_format is None:
+ self.fp8_format = os.environ.get(env_prefix + "FORMAT", "HYBRID")
+ self.fp8_format = self.fp8_format.upper()
+ if self.fp8_format not in get_args(FP8Format):
+ raise ValueError(f"`fp8_format` must be one of {' or '.join(get_args(FP8Format))}.")
+ if self.amax_compute_algo is None:
+ self.amax_compute_algo = os.environ.get(env_prefix + "AMAX_COMPUTE_ALGO", "most_recent")
+ self.amax_compute_algo = self.amax_compute_algo.lower()
+ if self.amax_compute_algo not in get_args(AmaxComputeAlgorithm):
+ raise ValueError(f"`amax_compute_algo` must be one of {' or '.join(get_args(AmaxComputeAlgorithm))}")
+ if self.amax_history_len is None:
+ self.amax_history_len = int(os.environ.get(env_prefix + "AMAX_HISTORY_LEN", 1024))
+ if self.override_linear_precision is None:
+ fprop = parse_flag_from_env(env_prefix + "OVERRIDE_FPROP")
+ dgrad = parse_flag_from_env(env_prefix + "OVERRIDE_DGRAD")
+ wgrad = parse_flag_from_env(env_prefix + "OVERRIDE_WGRAD")
+ self.override_linear_precision = (fprop, dgrad, wgrad)
+
+
+@dataclass
+class MSAMPRecipeKwargs(KwargsHandler):
+ """
+ Use this object in your [`Accelerator`] to customize the initialization of the recipe for FP8 mixed precision
+ training with `ms-amp`.
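+
+ Args:
+ opt_level (`str`), one of `O1`, `O2` (default is `O2`):
+ What level of 8-bit collective communication should be used with MS-AMP. In general:
+ * O1: Weight gradients and `all_reduce` communications are done in fp8, reducing GPU
+ memory usage and communication bandwidth
+ * O2: First-order optimizer states are in 8-bit, and second order states are in FP16.
+ Only available when using Adam or AdamW. This maintains accuracy and can potentially save the
+ highest memory.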
+ """
+
+ opt_level: OptLevel = None
+
+ def __post_init__(self):
+ env_prefix = "ACCELERATE_FP8_"
+ if self.opt_level is None:
+ self.opt_level = os.environ.get(env_prefix + "OPT_LEVEL", "O2")
+ if self.opt_level not in get_args(OptLevel):
+ raise ValueError(f"`opt_level` must be one of {' or '.join(get_args(OptLevel))}")
+
+
+@dataclass
+class FP8RecipeKwargs(TERecipeKwargs, MSAMPRecipeKwargs):
+ """
+ Deprecated. Please use one of the proper FP8 recipe kwargs classes such as `TERecipeKwargs` or `MSAMPRecipeKwargs`
+ instead.
+ """
+
+ backend: Backend = None
+
+ def __post_init__(self):
+ env_prefix = "ACCELERATE_FP8_"
+ warnings.warn(
+ "FP8RecipeKwargs is deprecated and will be removed in Accelerate v2.0.0. "
+ "Please use one of the proper FP8 recipe kwargs classes such as TERecipeKwargs or MSAMPRecipeKwargs instead.",
+ FutureWarning,
+ )
default_backend = "msamp" if is_msamp_available() else "te"
if self.backend is None:
self.backend = os.environ.get(env_prefix + "BACKEND", default_backend)
self.backend = self.backend.upper()
if self.backend not in get_args(Backend):
- raise ValueError("`backend` must be 'MSAMP' or 'TE' (TransformerEngine).")
- # Check TE args
- if self.backend == "TE":
- if not is_transformer_engine_available():
- raise ValueError(
- "TransformerEngine is not available. Please either install it, or use the 'MSAMP' backend (if installed)."
- )
- if self.use_autocast_during_eval is None:
- self.use_autocast_during_eval = parse_flag_from_env(env_prefix + "USE_AUTOCAST_DURING_EVAL")
- if self.margin is None:
- self.margin = int(os.environ.get(env_prefix + "MARGIN", 0))
- if self.interval is None:
- self.interval = int(os.environ.get(env_prefix + "INTERVAL", 1))
- if self.fp8_format is None:
- self.fp8_format = os.environ.get(env_prefix + "FORMAT", "HYBRID")
- self.fp8_format = self.fp8_format.upper()
- if self.fp8_format not in get_args(FP8Format):
- raise ValueError(f"`fp8_format` must be one of {' or '.join(get_args(FP8Format))}.")
- if self.amax_compute_algo is None:
- self.amax_compute_algo = os.environ.get(env_prefix + "AMAX_COMPUTE_ALGO", "most_recent")
- self.amax_compute_algo = self.amax_compute_algo.lower()
- if self.amax_compute_algo not in get_args(AmaxComputeAlgorithm):
- raise ValueError(f"`amax_compute_algo` must be one of {' or '.join(get_args(AmaxComputeAlgorithm))}")
- if self.amax_history_len is None:
- self.amax_history_len = int(os.environ.get(env_prefix + "AMAX_HISTORY_LEN", 1024))
- if self.override_linear_precision is None:
- fprop = parse_flag_from_env(env_prefix + "OVERRIDE_FPROP")
- dgrad = parse_flag_from_env(env_prefix + "OVERRIDE_DGRAD")
- wgrad = parse_flag_from_env(env_prefix + "OVERRIDE_WGRAD")
- self.override_linear_precision = (fprop, dgrad, wgrad)
- elif self.backend == "MSAMP":
- if not is_msamp_available():
- raise ValueError(
- "MS-AMP is not available. Please either install it, or use the 'TE' backend (if installed)."
- )
- if self.opt_level is None:
- self.opt_level = os.environ.get(env_prefix + "OPT_LEVEL", "O2")
- if self.opt_level not in get_args(OptLevel):
- raise ValueError(f"`optimization_level` must be one of {' or '.join(get_args(OptLevel))}")
+ raise ValueError("`backend` must be 'MSAMP' or 'TE' (TransformerEngine) to use `FP8RecipeKwargs`.")
+ super().__post_init__()
# Literal
diff --git a/src/accelerate/utils/imports.py b/src/accelerate/utils/imports.py
index b271dab9a9a..c103b41f737 100644
--- a/src/accelerate/utils/imports.py
+++ b/src/accelerate/utils/imports.py
@@ -110,7 +110,7 @@ def is_lomo_available():
def is_fp8_available():
- return is_msamp_available() or is_transformer_engine_available()
+ return is_msamp_available() or is_transformer_engine_available() or is_torchao_available()
def is_cuda_available():
@@ -142,6 +142,14 @@ def is_torch_xla_available(check_is_tpu=False, check_is_gpu=False):
return True
+def is_torchao_available():
+ package_exists = _is_package_available("torchao")
+ if package_exists:
+ torchao_version = version.parse(importlib.metadata.version("torchao"))
+ return compare_versions(torchao_version, ">=", "0.6.1")
+ return False
+
+
def is_deepspeed_available():
if is_mlu_available():
return _is_package_available("deepspeed", metadata_name="deepspeed-mlu")
@@ -422,6 +430,22 @@ def is_torchdata_stateful_dataloader_available():
return False
+def torchao_required(func):
+ """
+ A decorator that ensures the decorated function is only called when torchao is available.
+ """
+
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ if not is_torchao_available():
+ raise ImportError(
+ "`torchao` is not available, please install it before calling this function via `pip install torchao`."
+ )
+ return func(*args, **kwargs)
+
+ return wrapper
+
+
# TODO: Rework this into `utils.deepspeed` and migrate the "core" chunks into `accelerate.deepspeed`
def deepspeed_required(func):
"""
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000000..8568c82be1c
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/test_fp8.py b/tests/test_fp8.py
index eb35f183b6a..7e3814c35f2 100644
--- a/tests/test_fp8.py
+++ b/tests/test_fp8.py
@@ -20,13 +20,26 @@
from accelerate import Accelerator
from accelerate.state import AcceleratorState
-from accelerate.test_utils import get_launch_command, require_cuda, require_multi_gpu, require_transformer_engine
+from accelerate.test_utils import (
+ get_launch_command,
+ require_cuda,
+ require_huggingface_suite,
+ require_multi_gpu,
+ require_torchao,
+ require_transformer_engine,
+)
from accelerate.test_utils.testing import require_deepspeed, run_command
-from accelerate.utils import FP8RecipeKwargs, has_transformer_engine_layers
+from accelerate.utils import (
+ AORecipeKwargs,
+ FP8RecipeKwargs,
+ has_ao_layers,
+ has_transformer_engine_layers,
+ is_torchao_available,
+ is_transformer_engine_available,
+)
-def can_convert_model():
- print("Starting basic_fp8_test")
+def can_convert_te_model():
accelerator_kwargs = {"mixed_precision": "fp8", "kwargs_handlers": [FP8RecipeKwargs(backend="TE")]}
accelerator = Accelerator(**accelerator_kwargs)
dataloader = torch.utils.data.DataLoader(torch.randn(10, 32), batch_size=2)
@@ -44,6 +57,20 @@ def maintain_proper_deepspeed_config(expected_version):
), f"Expected zero stage {expected_version} but got {AcceleratorState().deepspeed_plugin.zero_stage}"
+def can_convert_ao_model():
+ from transformers import AutoModelForSequenceClassification
+
+ accelerator_kwargs = {"mixed_precision": "fp8", "kwargs_handlers": [AORecipeKwargs()]}
+ accelerator = Accelerator(**accelerator_kwargs)
+ dataloader = torch.utils.data.DataLoader(torch.randn(10, 32), batch_size=2)
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
+
+ model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
+ assert has_ao_layers(model)
+
+
@require_transformer_engine
class TestTransformerEngine(unittest.TestCase):
@require_cuda
@@ -91,7 +118,60 @@ def test_can_prepare_model_multigpu_deepspeed(self):
run_command(command)
+@require_torchao
+@require_huggingface_suite
+class TestTorchAO(unittest.TestCase):
+ @require_cuda
+ def test_can_prepare_model_single_gpu(self):
+ command = get_launch_command(num_processes=1, monitor_interval=0.1)
+ command += ["-m", "tests.test_fp8"]
+ run_command(command)
+
+ @require_multi_gpu
+ def test_can_prepare_model_multi_gpu(self):
+ command = get_launch_command(num_processes=2, monitor_interval=0.1)
+ command += ["-m", "tests.test_fp8"]
+ run_command(command)
+
+ @require_deepspeed
+ @require_multi_gpu
+ def test_can_prepare_model_multigpu_deepspeed(self):
+ for zero_stage in [1, 2, 3]:
+ os.environ["ZERO_STAGE"] = str(zero_stage)
+ ds_config = {
+ "bf16": {"enabled": True},
+ "zero_optimization": {
+ "stage": zero_stage,
+ "allgather_partitions": True,
+ "allgather_bucket_size": 2e8,
+ "overlap_comm": True,
+ "reduce_scatter": True,
+ "reduce_bucket_size": 2e8,
+ "contiguous_gradients": True,
+ },
+ "gradient_accumulation_steps": 1,
+ "gradient_clipping": "auto",
+ "steps_per_print": 2000,
+ "train_batch_size": "auto",
+ "train_micro_batch_size_per_gpu": "auto",
+ "wall_clock_breakdown": False,
+ }
+
+ ds_config = json.dumps(ds_config)
+
+ command = get_launch_command(
+ num_processes=2, monitor_interval=0.1, use_deepspeed=True, deepspeed_config_file=ds_config
+ )
+ command += ["-m", "tests.test_fp8"]
+ run_command(command)
+
+
if __name__ == "__main__":
- can_convert_model()
- if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
- maintain_proper_deepspeed_config(int(os.environ.get("ZERO_STAGE")))
+ # TE suite
+ if is_transformer_engine_available():
+ can_convert_te_model()
+ if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
+ maintain_proper_deepspeed_config(int(os.environ.get("ZERO_STAGE")))
+ # AO suite
+ if is_torchao_available():
+ can_convert_ao_model()