From cbc4f85e34fc8016c77c87b085a6141db7fa5830 Mon Sep 17 00:00:00 2001
From: Austin Liu
Date: Fri, 30 Aug 2024 07:58:07 +0800
Subject: [PATCH] Skip Tests for GPUs Not Supporting `bf16` (#159)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Closes https://github.com/linkedin/Liger-Kernel/issues/87

Skipped tests for `bfloat16` on GPUs with a compute capability below the Ampere architecture (`sm_80`).

## Testing Done

- Hardware Type: NVIDIA **T4** (should skip most cases)
- [X] run `make test` to ensure correctness
- [X] run `make checkstyle` to ensure code style
- [X] run `make test-convergence` to ensure convergence

```
⚡ main ~/Liger-Kernel make all
python -m pytest --disable-warnings test/ --ignore=test/convergence
HF_DATASETS_OFFLINE=1 python -m pytest --disable-warnings test/convergence
flake8 .; flake8_status=$?; \
isort .; isort_status=$?; \
black .; black_status=$?; \
if [ $flake8_status -ne 0 ] || [ $isort_status -ne 0 ] || [ $black_status -ne 0 ]; then \
exit 1; \
fi
=================================================================== test session starts ====================================================================
platform linux -- Python 3.10.10, pytest-8.3.2, pluggy-1.5.0
rootdir: /teamspace/studios/this_studio/Liger-Kernel
plugins: anyio-4.4.0
collecting ...
=================================================================== test session starts ====================================================================
platform linux -- Python 3.10.10, pytest-8.3.2, pluggy-1.5.0
rootdir: /teamspace/studios/this_studio/Liger-Kernel
plugins: anyio-4.4.0
collecting ... Skipped 1 files
All done! ✨ 🍰 ✨
58 files left unchanged.
collected 163 items

test/transformers/test_auto_model.py . [  0%]
test/transformers/test_cross_entropy.py ssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 36%]
collected 28 items

test/convergence/test_mini_models.py .....s.....s.... [ 43%]
test/transformers/test_geglu.py .s....ssss [ 48%]
test/transformers/test_monkey_patch.py ..... [ 51%]
test/transformers/test_rms_norm.py ........ssssssss...............ssssssss........ [ 80%]
test/transformers/test_rope.py ......ssssss [ 88%]
test/transformers/test_swiglu.py ....ssss.s....ssss [ 98%]
test/transformers/test_trainer_integration.py . [ 98%]
test/triton/test_triton_monkey_patch.py .. [100%]
======================================================== 71 passed, 92 skipped in 136.69s (0:02:16) ========================================================
.s.s.s [ 50%]
test/convergence/test_mini_models_no_logits.py .s.s.s.s.s.s.s [100%]
======================================================== 14 passed, 14 skipped in 353.27s (0:05:53) ========================================================
```

- Hardware Type: NVIDIA **L4** (should skip only a few cases)
- [X] run `make test` to ensure correctness
- [X] run `make checkstyle` to ensure code style
- [X] run `make test-convergence` to ensure convergence

```
⚡ main ~/Liger-Kernel make all
python -m pytest --disable-warnings test/ --ignore=test/convergence
HF_DATASETS_OFFLINE=1 python -m pytest --disable-warnings test/convergence
flake8 .; flake8_status=$?; \
isort .; isort_status=$?; \
black .; black_status=$?; \
if [ $flake8_status -ne 0 ] || [ $isort_status -ne 0 ] || [ $black_status -ne 0 ]; then \
exit 1; \
fi
=================================================================== test session starts ====================================================================
platform linux -- Python 3.10.10, pytest-8.3.2, pluggy-1.5.0
rootdir: /teamspace/studios/this_studio/Liger-Kernel
plugins: anyio-4.4.0
collecting ...
=================================================================== test session starts ====================================================================
platform linux -- Python 3.10.10, pytest-8.3.2, pluggy-1.5.0
rootdir: /teamspace/studios/this_studio/Liger-Kernel
plugins: anyio-4.4.0
collecting ... Skipped 1 files
All done! ✨ 🍰 ✨
58 files left unchanged.
collected 163 items

test/transformers/test_auto_model.py . [  0%]
collected 28 items

test/convergence/test_mini_models.py ........................................................ss [ 36%]
test/transformers/test_fused_linear_cross_entropy.py ............... [ 43%]
test/transformers/test_geglu.py ......... [ 48%]
test/transformers/test_monkey_patch.py ..... [ 51%]
test/transformers/test_rms_norm.py ................................................. [ 80%]
test/transformers/test_rope.py ............ [ 88%]
test/transformers/test_swiglu.py .................. [ 98%]
test/transformers/test_trainer_integration.py . [ 98%]
test/triton/test_triton_monkey_patch.py .. [100%]
======================================================== 161 passed, 2 skipped in 90.45s (0:01:30) =========================================================
....... [ 50%]
test/convergence/test_mini_models_no_logits.py .............. [100%]
============================================================== 28 passed in 290.65s (0:04:50) ==============================================================
```

## Additional Context

For your reference, here is a list of NVIDIA architecture names and the compute capabilities they correspond to:

[Screenshot 2024-08-29 at 6.04.56 PM: table of NVIDIA architecture names and their compute capabilities]

---------

Signed-off-by: Austin Liu
Co-authored-by: Shao Tang
---
 test/convergence/test_mini_models.py          | 113 ++++++++++++++++--
 .../convergence/test_mini_models_no_logits.py | 113 ++++++++++++++++--
 test/transformers/test_cross_entropy.py       |  84 +++++++++++--
 test/transformers/test_geglu.py               |  11 +-
 test/transformers/test_rms_norm.py            |  11 +-
 test/transformers/test_rope.py                |  11 +-
 test/transformers/test_swiglu.py              |  20 +++-
 test/utils.py                                 |   6 +
 8 files changed, 342 insertions(+), 27 deletions(-)

diff --git a/test/convergence/test_mini_models.py b/test/convergence/test_mini_models.py
index 4bc6293fa..95c832e15 100644
--- a/test/convergence/test_mini_models.py
+++ b/test/convergence/test_mini_models.py
@@ -6,6 +6,7 @@
     assert_verbose_allclose,
     set_seed,
     simple_collate_fn,
+    supports_bfloat16,
 )
 
 import pytest
@@ -344,23 +345,121 @@ def run_mini_model(
     [
         # Gemma 1.1 and 2 has more tolerance because currently, the kernel is not a perfect match (casts are not done the same way)
         ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 6e-4, 5e-3, 1e-5, 5e-3, 1e-5),
-        ("mini_gemma1", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5),
+        pytest.param(
+            "mini_gemma1",
+            32,
+            1e-4,
+            torch.bfloat16,
+            1e-8,
+            1e-5,
+            1e-2,
+            1e-5,
+            1e-2,
+            1e-5,
+            marks=pytest.mark.skipif(
+                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
+            ),
+        ),
         ("mini_gemma1.1", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5),
-        ("mini_gemma1.1", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5),
+        pytest.param(
+            "mini_gemma1.1",
+            32,
+            1e-4,
+            torch.bfloat16,
+            1e-8,
+            1e-5,
+            1e-2,
+            1e-5,
+            1e-2,
+            1e-5,
+            marks=pytest.mark.skipif(
+                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
+            ),
+        ),
         ("mini_gemma2", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5),
-        ("mini_gemma2", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5),
+        pytest.param(
+            "mini_gemma2",
+            32,
+            1e-4,
+            torch.bfloat16,
+            1e-8,
+            1e-5,
+            1e-2,
+            1e-5,
+            1e-2,
+            1e-5,
+            marks=pytest.mark.skipif(
+                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
+            ),
+        ),
         ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5),
-        ("mini_llama3", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5),
+        pytest.param(
+            "mini_llama3",
+            32,
+            1e-4,
+            torch.bfloat16,
+            1e-8,
+            1e-5,
+            1e-2,
+            1e-5,
+            1e-2,
+            1e-5,
+            marks=pytest.mark.skipif(
+                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
+            ),
+        ),
         # TODO: torch 2.5.0 nightly breaks mixtral test, but torch 2.3.0 works fine
         # TODO: mixtral MoE structure makes the convergence flaky so disable the test for now. It needs high tol to pass.
# ("mini_mixtral", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 8e-3, 1e-5), # ("mini_mixtral", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 2.0, 1e-5, 1e-2, 1e-5), ("mini_mistral", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), - ("mini_mistral", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_mistral", + 32, + 1e-4, + torch.bfloat16, + 1e-8, + 1e-5, + 1e-2, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ("mini_qwen2", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), - ("mini_qwen2", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_qwen2", + 32, + 1e-4, + torch.bfloat16, + 1e-8, + 1e-5, + 1e-2, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ("mini_phi3", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), - ("mini_phi3", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_phi3", + 32, + 1e-4, + torch.bfloat16, + 1e-8, + 1e-5, + 1e-2, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ], ) def test_mini_model( diff --git a/test/convergence/test_mini_models_no_logits.py b/test/convergence/test_mini_models_no_logits.py index 7f99cf210..3a3272f8d 100644 --- a/test/convergence/test_mini_models_no_logits.py +++ b/test/convergence/test_mini_models_no_logits.py @@ -4,6 +4,7 @@ assert_verbose_allclose, set_seed, simple_collate_fn, + supports_bfloat16, ) import pytest @@ -291,20 +292,118 @@ def run_mini_model( "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logits_atol, logits_rtol, param_atol, param_rtol", [ ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 1e-4, 1e-5, 5e-3, 1e-5), - ("mini_llama3", 32, 1e-4, torch.bfloat16, 5e-3, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_llama3", + 32, + 1e-4, + torch.bfloat16, + 5e-3, + 1e-5, + 1e-2, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ("mini_qwen2", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), - ("mini_qwen2", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_qwen2", + 32, + 1e-4, + torch.bfloat16, + 1e-8, + 1e-5, + 1e-2, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ("mini_phi3", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), - ("mini_phi3", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_phi3", + 32, + 1e-4, + torch.bfloat16, + 1e-8, + 1e-5, + 1e-2, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ("mini_mistral", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), - ("mini_mistral", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-2, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_mistral", + 32, + 1e-4, + torch.bfloat16, + 1e-8, + 1e-5, + 1e-2, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), # Gemma 1.1 and 2 has more tolerance because currently, the kernel is not a perfect match (casts are not done the same way) ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5), 
- ("mini_gemma1", 32, 1e-4, torch.bfloat16, 1e-2, 1e-4, 2e-1, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_gemma1", + 32, + 1e-4, + torch.bfloat16, + 1e-2, + 1e-4, + 2e-1, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ("mini_gemma1.1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5), - ("mini_gemma1.1", 32, 1e-4, torch.bfloat16, 1e-2, 1e-4, 2e-1, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_gemma1.1", + 32, + 1e-4, + torch.bfloat16, + 1e-2, + 1e-4, + 2e-1, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ("mini_gemma2", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5), - ("mini_gemma2", 32, 1e-4, torch.bfloat16, 1e-2, 1e-4, 2e-1, 1e-5, 1e-2, 1e-5), + pytest.param( + "mini_gemma2", + 32, + 1e-4, + torch.bfloat16, + 1e-2, + 1e-4, + 2e-1, + 1e-5, + 1e-2, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ], ) def test_mini_model( diff --git a/test/transformers/test_cross_entropy.py b/test/transformers/test_cross_entropy.py index bd3c4592f..29b56cdc5 100644 --- a/test/transformers/test_cross_entropy.py +++ b/test/transformers/test_cross_entropy.py @@ -1,3 +1,5 @@ +from test.utils import supports_bfloat16 + import pytest import torch from torch.nn import CrossEntropyLoss @@ -99,14 +101,42 @@ def _test_correctness_not_last_layer_once( @pytest.mark.parametrize( "scalar, dtype, atol, rtol", [ - (0.1, torch.bfloat16, 1e-8, 5e-2), - (1.0, torch.bfloat16, 1e-8, 5e-2), - (10.0, torch.bfloat16, 1e-7, 5e-2), + pytest.param( + 0.1, + torch.bfloat16, + 1e-8, + 5e-2, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), + pytest.param( + 1.0, + torch.bfloat16, + 1e-8, + 5e-2, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), + pytest.param( + 10.0, + torch.bfloat16, + 1e-7, + 5e-2, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), (0.1, torch.float32, 1e-8, 1e-6), (1.0, torch.float32, 1e-8, 1e-6), (10.0, torch.float32, 1e-8, 1e-6), ], ) +@pytest.mark.skipif( + torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000, + reason="Needs 16GB+ GPU memory.", +) def test_correctness(B, T, V, scalar, dtype, atol, rtol): liger_ce = LigerCrossEntropyLoss() _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol) @@ -125,14 +155,42 @@ def test_correctness(B, T, V, scalar, dtype, atol, rtol): @pytest.mark.parametrize( "scalar, dtype, atol, rtol", [ - (0.1, torch.bfloat16, 1e-8, 5e-2), - (1.0, torch.bfloat16, 1e-8, 5e-2), - (10.0, torch.bfloat16, 1e-8, 5e-2), + pytest.param( + 0.1, + torch.bfloat16, + 1e-8, + 5e-2, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), + pytest.param( + 1.0, + torch.bfloat16, + 1e-8, + 5e-2, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), + pytest.param( + 10.0, + torch.bfloat16, + 1e-8, + 5e-2, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), (0.1, torch.float32, 1e-8, 1e-6), (1.0, torch.float32, 1e-8, 1e-6), (10.0, torch.float32, 1e-8, 1e-6), ], ) +@pytest.mark.skipif( + torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000, + 
reason="Needs 16GB+ GPU memory.", +) def test_correctness_with_ignore_index( B, T, V, ignore_index, scalar, dtype, atol, rtol ): @@ -155,10 +213,22 @@ def test_correctness_with_ignore_index( @pytest.mark.parametrize( "scalar, dtype, atol, rtol", [ - (1.0, torch.bfloat16, 1e-8, 5e-2), + pytest.param( + 1.0, + torch.bfloat16, + 1e-8, + 5e-2, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), (1.0, torch.float32, 1e-8, 1e-6), ], ) +@pytest.mark.skipif( + torch.cuda.get_device_properties(0).total_memory < 16 * 1000 * 1000 * 1000, + reason="Needs 16GB+ GPU memory.", +) def test_correctness_not_last_layer(B, T, V, scalar, dtype, atol, rtol): liger_ce = LigerCrossEntropyLoss() _test_correctness_not_last_layer_once(liger_ce, B, T, V, scalar, dtype, atol, rtol) diff --git a/test/transformers/test_geglu.py b/test/transformers/test_geglu.py index b06fa04bf..225946414 100644 --- a/test/transformers/test_geglu.py +++ b/test/transformers/test_geglu.py @@ -1,3 +1,5 @@ +from test.utils import supports_bfloat16 + import pytest import torch from transformers.models.llama.configuration_llama import LlamaConfig @@ -29,7 +31,14 @@ # atol is for small values: they have more difference, so set atol higher # rtol is for larger values: they are very close, so set rtol lower (torch.float32, 1e-0, 2e-6), - (torch.bfloat16, 1e4, 6e-3), + pytest.param( + torch.bfloat16, + 1e4, + 6e-3, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ], ) def test_correctness(bsz, seq_len, hidden_size, intermediate_size, dtype, atol, rtol): diff --git a/test/transformers/test_rms_norm.py b/test/transformers/test_rms_norm.py index 8570f7cb4..e15e35162 100644 --- a/test/transformers/test_rms_norm.py +++ b/test/transformers/test_rms_norm.py @@ -1,5 +1,5 @@ import os -from test.utils import assert_verbose_allclose +from test.utils import assert_verbose_allclose, supports_bfloat16 import pytest import torch @@ -71,7 +71,14 @@ def forward(self, x): "dtype, atol, rtol", [ (torch.float32, 1e-4, 1e-6), - (torch.bfloat16, 2e-1, 2e-2), + pytest.param( + torch.bfloat16, + 2e-1, + 2e-2, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), (torch.float16, 2e-1, 2e-2), ], ) diff --git a/test/transformers/test_rope.py b/test/transformers/test_rope.py index 4605be5e6..a0cad1ac5 100644 --- a/test/transformers/test_rope.py +++ b/test/transformers/test_rope.py @@ -1,3 +1,5 @@ +from test.utils import supports_bfloat16 + import pytest import torch from transformers.models.llama.modeling_llama import ( @@ -29,7 +31,14 @@ "dtype, atol, rtol", [ (torch.float32, 1e-5, 1e-5), - (torch.bfloat16, 1e-1, 1e-5), + pytest.param( + torch.bfloat16, + 1e-1, + 1e-5, + marks=pytest.mark.skipif( + not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + ), + ), ], ) def test_correctness( diff --git a/test/transformers/test_swiglu.py b/test/transformers/test_swiglu.py index 0b8ef3d45..7f3635242 100644 --- a/test/transformers/test_swiglu.py +++ b/test/transformers/test_swiglu.py @@ -1,3 +1,5 @@ +from test.utils import supports_bfloat16 + import pytest import torch from transformers.models.llama.configuration_llama import LlamaConfig @@ -37,7 +39,14 @@ # rtol is for larger values: they are very close, so set rtol lower (torch.float32, 1e-0, 1e-5), # TODO: we should find a better way to tune this. 
-        (torch.bfloat16, 1e4, 1e-2),
+        pytest.param(
+            torch.bfloat16,
+            1e4,
+            1e-2,
+            marks=pytest.mark.skipif(
+                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
+            ),
+        ),
     ],
 )
 def test_correctness_llamamlp(
@@ -113,7 +122,14 @@ def test_correctness_llamamlp(
         # rtol is for larger values: they are very close, so set rtol lower
         (torch.float32, 1e-0, 1e-5),
         # TODO: we should find a better way to tune this. 1e4 is too large apparently
-        (torch.bfloat16, 1e4, 1e-2),
+        pytest.param(
+            torch.bfloat16,
+            1e4,
+            1e-2,
+            marks=pytest.mark.skipif(
+                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
+            ),
+        ),
     ],
 )
 def test_correctness_phi3mlp(
diff --git a/test/utils.py b/test/utils.py
index 2e0c109f5..cb66742e2 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -112,3 +112,9 @@ def simple_collate_fn(data: List[Dict[str, Any]]):
             "labels": labels,
         }
     )
+
+
+def supports_bfloat16():
+    if not torch.cuda.is_available():
+        return False
+    return torch.cuda.get_device_capability() >= (8, 0)  # Ampere and newer
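
For reviewers who want to try the pattern in isolation, below is a minimal, self-contained sketch of how the new `supports_bfloat16()` helper gates a single `pytest.param` entry. Only the helper and the `marks=pytest.mark.skipif(...)` pattern come from the diff above; the test name, body, and tolerances are illustrative placeholders, not part of this patch.

```
# Minimal sketch of the skip pattern applied throughout this patch.
# `supports_bfloat16()` mirrors the helper added to test/utils.py; the test
# below is a placeholder and only exercises the skip machinery.
import pytest
import torch


def supports_bfloat16():
    if not torch.cuda.is_available():
        return False
    return torch.cuda.get_device_capability() >= (8, 0)  # Ampere (sm_80) and newer


@pytest.mark.parametrize(
    "dtype, atol, rtol",
    [
        (torch.float32, 1e-5, 1e-5),
        pytest.param(
            torch.bfloat16,
            1e-1,
            1e-5,
            marks=pytest.mark.skipif(
                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
            ),
        ),
    ],
)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
def test_roundtrip_placeholder(dtype, atol, rtol):
    # Placeholder body: the real tests compare Liger kernels against Hugging Face
    # reference implementations; here we only check that the parametrization runs.
    x = torch.randn(8, 8, device="cuda", dtype=dtype)
    assert torch.allclose(x, x.clone(), atol=atol, rtol=rtol)
```

Marking individual `pytest.param` entries, rather than the whole test, keeps the `float32` cases running on pre-Ampere GPUs such as the T4, which is why the T4 run above reports skips only for the `bfloat16` parametrizations.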